mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-29 05:22:55 +00:00
Compare commits
46 Commits
0.9
...
bump-versi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a3e520bfe | ||
|
|
bcd7386fc5 | ||
|
|
c23a7c992b | ||
|
|
2a88094ec4 | ||
|
|
ca3cfddab4 | ||
|
|
7bd9f9773b | ||
|
|
e2da92fcb5 | ||
|
|
876e1451c4 | ||
|
|
a37d2f9777 | ||
|
|
4822940b19 | ||
|
|
d590f4c6b0 | ||
|
|
edfa619519 | ||
|
|
96f194635f | ||
|
|
444662485f | ||
|
|
943c25d0f8 | ||
|
|
5c0b2a4579 | ||
|
|
9870a9258d | ||
|
|
7102b363f5 | ||
|
|
66b4615e4e | ||
|
|
da46913839 | ||
|
|
3df037961f | ||
|
|
8ffae47854 | ||
|
|
1a90a1f3b0 | ||
|
|
dac50c6aeb | ||
|
|
31b22c5acc | ||
|
|
8e50921363 | ||
|
|
96a4f503ec | ||
|
|
9df288b0c9 | ||
|
|
b7c2d0de97 | ||
|
|
62445e0ec8 | ||
|
|
a228825462 | ||
|
|
d3eabd14bc | ||
|
|
c967031d21 | ||
|
|
d823163d52 | ||
|
|
c4f59f202d | ||
|
|
acd29b535d | ||
|
|
2cd31bcda2 | ||
|
|
99870de55c | ||
|
|
cad2d91845 | ||
|
|
79f3cd6cf4 | ||
|
|
e3abb4481b | ||
|
|
bfa61d2f2f | ||
|
|
6c0e621fdb | ||
|
|
a8cc5208f1 | ||
|
|
83eb0d0cb7 | ||
|
|
ee6e273365 |
23
.travis.yml
23
.travis.yml
@@ -10,7 +10,7 @@ env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
|
||||
addons:
|
||||
apt:
|
||||
@@ -29,7 +29,7 @@ addons:
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
@@ -38,12 +38,12 @@ matrix:
|
||||
# Linux
|
||||
#- env: TARGET=aarch64-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 UPLOAD_DOCS=1
|
||||
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
|
||||
|
||||
# OSX
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
@@ -52,6 +52,7 @@ before_install:
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
- env | grep "TRAVIS"
|
||||
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
@@ -64,10 +65,20 @@ script:
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
|
||||
after_success:
|
||||
# Needs GH_TOKEN env var to be set in travis settings
|
||||
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
|
||||
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
- find ./target/debug -type f -maxdepth 1 -delete
|
||||
- rm -f ./target/.rustc_info.json
|
||||
- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
|
||||
- rm -r target/debug/examples/
|
||||
- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
@@ -77,4 +88,4 @@ before_cache:
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
on_success: never
|
||||
|
||||
70
CHANGELOG.md
70
CHANGELOG.md
@@ -1,3 +1,44 @@
|
||||
Tantivy 0.10.0
|
||||
=====================
|
||||
|
||||
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
|
||||
|
||||
- Added an ASCII folding filter (@drusellers)
|
||||
- Bugfix in `query.count` in presence of deletes (@pmasurel)
|
||||
- Added `.explain(...)` in `Query` and `Weight` (@pmasurel)
|
||||
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
|
||||
All segments are simply removed.
|
||||
|
||||
Minor
|
||||
---------
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
on segment postings should panic from now on.
|
||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||
- `IndexMeta` is now public. (@hntd187)
|
||||
- `IndexWriter` is `Sync`, making it possible to use it with an
|
||||
`Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term`
|
||||
only require a read lock. (@pmasurel)
|
||||
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||
|
||||
## How to update?
|
||||
|
||||
Your existing indexes are usable as is, but you may need some
|
||||
trivial updates.
|
||||
|
||||
### Fast fields
|
||||
|
||||
Fast fields used to be accessed directly from the `SegmentReader`.
|
||||
The API changed, you are now required to acquire your fast field reader via the
|
||||
`segment_reader.fast_fields()`, and use one of the typed methods:
|
||||
- `.u64()`, `.i64()` if your field is single-valued ;
|
||||
- `.u64s()`, `.i64s()` if your field is multi-valued ;
|
||||
- `.bytes()` if your field is a bytes fast field.
|
||||
|
||||
|
||||
|
||||
Tantivy 0.9.0
|
||||
=====================
|
||||
*0.9.0 index format is not compatible with the
|
||||
@@ -15,6 +56,35 @@ previous index format.*
|
||||
for int fields. (@fulmicoton)
|
||||
- Added DateTime field (@barrotsteindev)
|
||||
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
|
||||
- SIMD linear search within blocks (@fulmicoton)
|
||||
|
||||
## How to update ?
|
||||
|
||||
tantivy 0.9 brought some API breaking changes.
|
||||
To update from tantivy 0.8, you will need to go through the following steps.
|
||||
|
||||
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
|
||||
- The index does not hold the pool of searchers anymore. You are required to create an intermediary object called
|
||||
`IndexReader` for this.
|
||||
|
||||
```rust
|
||||
// create the reader. You typically need to create 1 reader for the entire
|
||||
// lifetime of your program.
|
||||
let reader = index.reader()?;
|
||||
|
||||
// Acquiring a searcher (previously `index.searcher()`) is now written:
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// With the default setting of the reader, you are not required to
|
||||
// call `index.load_searchers()` anymore.
|
||||
//
|
||||
// The IndexReader will pick up that change automatically, regardless
|
||||
// of whether the update was done in a different process or not.
|
||||
// If this behavior is not wanted, you can create your reader with
|
||||
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
|
||||
// by calling `reader.reload()?`.
|
||||
|
||||
```
|
||||
|
||||
|
||||
Tantivy 0.8.2
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.9.0"
|
||||
version = "0.10.0-dev"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -23,7 +23,7 @@ snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "3.0"
|
||||
log = "0.4"
|
||||
combine = "3"
|
||||
combine = ">=3.6.0,<4.0.0"
|
||||
tempdir = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
@@ -42,7 +42,7 @@ owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "1.1"
|
||||
downcast-rs = { version="1.0" }
|
||||
bitpacking = "0.6"
|
||||
bitpacking = "0.7"
|
||||
census = "0.2"
|
||||
fnv = "1.0.6"
|
||||
owned-read = "0.4"
|
||||
@@ -54,7 +54,7 @@ murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
winapi = "0.3"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.6"
|
||||
|
||||
22
README.md
22
README.md
@@ -4,6 +4,7 @@
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://crates.io/crates/tantivy)
|
||||
[](https://saythanks.io/to/fulmicoton)
|
||||
|
||||

|
||||
@@ -17,6 +18,7 @@
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
|
||||
|
||||
[](https://www.patreon.com/fulmicoton)
|
||||
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
@@ -27,6 +29,14 @@ to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
# Benchmark
|
||||
|
||||
Tantivy is typically faster than Lucene, but the results will depend on
|
||||
the nature of the queries in your workload.
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
|
||||
performance for different types of queries / collections.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
@@ -87,6 +97,14 @@ To check out and run tests, you can simply run :
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
|
||||
# Contribute
|
||||
# How can I support this project ?
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
There are many ways to support this project.
|
||||
|
||||
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
|
||||
- Report bugs
|
||||
- Write a blog post
|
||||
- Complete documentation
|
||||
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
|
||||
- Talk about tantivy around you
|
||||
- Drop a word on [Say Thanks](https://saythanks.io/to/fulmicoton) or even on [Patreon](https://www.patreon.com/fulmicoton)
|
||||
|
||||
@@ -18,8 +18,8 @@ use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::Index;
|
||||
use tantivy::SegmentReader;
|
||||
use tantivy::{Index, TantivyError};
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
@@ -75,9 +75,18 @@ impl Collector for StatsCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> tantivy::Result<StatsSegmentCollector> {
|
||||
let fast_field_reader = segment.fast_field_reader(self.field)?;
|
||||
let fast_field_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.ok_or_else(|| {
|
||||
let field_name = segment_reader.schema().get_field_name(self.field);
|
||||
TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a u64 fast field.",
|
||||
field_name
|
||||
))
|
||||
})?;
|
||||
Ok(StatsSegmentCollector {
|
||||
fast_field_reader,
|
||||
stats: Stats::default(),
|
||||
|
||||
107
examples/multiple_producer.rs
Normal file
107
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
// # Indexing from different threads.
|
||||
//
|
||||
// It is fairly common to have to index from different threads.
|
||||
// Tantivy forbids creating more than one `IndexWriter` at a time.
|
||||
//
|
||||
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
|
||||
// indexing threads will not help. However, it can still be useful for some applications.
|
||||
//
|
||||
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
|
||||
// your application, it is reasonable to have multiple threads.
|
||||
//
|
||||
// Another very common reason to want to index from multiple threads, is implementing a webserver
|
||||
// with CRUD capabilities. The server framework will most likely handle requests from
|
||||
// different threads.
|
||||
//
|
||||
// The recommended way to address both of these use cases is to wrap your `IndexWriter` in a
|
||||
// `Arc<RwLock<IndexWriter>>`.
|
||||
//
|
||||
// While this is counterintuitive, adding and deleting documents do not require mutability
|
||||
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
|
||||
//
|
||||
// The example below does not represent an actual real-life use case (who would spawn a thread to
|
||||
// index a single document?), but aims at demonstrating the mechanism that makes indexing
|
||||
// from several threads possible.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::Opstamp;
|
||||
use tantivy::{Index, IndexWriter};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
|
||||
|
||||
// # First indexing thread.
|
||||
let index_writer_clone_1 = index_writer.clone();
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
let opstamp = {
|
||||
// A read lock is sufficient here.
|
||||
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||
index_writer_rlock.add_document(
|
||||
doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
let index_writer_clone_2 = index_writer.clone();
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
// A read lock is sufficient here.
|
||||
let opstamp = {
|
||||
let index_writer_rlock = index_writer_clone_2.read().unwrap();
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rolling back, on the other hand, requires a write lock. This will block other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -17,6 +17,7 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -264,7 +265,10 @@ impl Collector for FacetCollector {
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
|
||||
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
|
||||
})?;
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
|
||||
@@ -2,6 +2,7 @@ use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use collector::Fruit;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Deref;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -199,7 +200,10 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
||||
self.collector_wrappers
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.any(Collector::requires_scoring)
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||
|
||||
@@ -114,11 +114,15 @@ impl Collector for FastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<FastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.expect("Requested field is not a fast field.");
|
||||
Ok(FastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -170,11 +174,14 @@ impl Collector for BytesFastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
reader: segment_reader
|
||||
.fast_fields()
|
||||
.bytes(self.field)
|
||||
.expect("Field is not a bytes fast field."),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -191,7 +198,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let data = self.reader.get_val(doc);
|
||||
let data = self.reader.get_bytes(doc);
|
||||
self.vals.extend(data);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,11 +98,11 @@ where
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub(crate) fn for_segment(
|
||||
pub(crate) fn for_segment<F: PartialOrd>(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
_: &SegmentReader,
|
||||
) -> Result<TopSegmentCollector<T>> {
|
||||
) -> Result<TopSegmentCollector<F>> {
|
||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,10 +5,12 @@ use collector::SegmentCollector;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use std::marker::PhantomData;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
|
||||
/// The Top Field Collector keeps track of the K documents
|
||||
/// sorted by a fast field in the index
|
||||
@@ -106,8 +108,15 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TopFieldSegmentCollector<T>> {
|
||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
||||
let reader = reader.fast_field_reader(self.field)?;
|
||||
Ok(TopFieldSegmentCollector { collector, reader })
|
||||
let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
|
||||
})?;
|
||||
Ok(TopFieldSegmentCollector {
|
||||
collector,
|
||||
reader,
|
||||
_type: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -122,9 +131,10 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
||||
collector: TopSegmentCollector<T>,
|
||||
reader: FastFieldReader<T>,
|
||||
pub struct TopFieldSegmentCollector<T> {
|
||||
collector: TopSegmentCollector<u64>,
|
||||
reader: FastFieldReader<u64>,
|
||||
_type: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
@@ -138,7 +148,11 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
self.collector.harvest()
|
||||
self.collector
|
||||
.harvest()
|
||||
.into_iter()
|
||||
.map(|(val, doc_address)| (T::from_u64(val), doc_address))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,7 +249,7 @@ mod tests {
|
||||
.for_segment(0, segment)
|
||||
.map(|_| ())
|
||||
.unwrap_err(),
|
||||
TantivyError::FastFieldError(_)
|
||||
TantivyError::SchemaError(_)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,10 @@ pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
/// We do not allow segments whose max doc reaches `MAX_DOC_LIMIT` (i.e. `1 << 31`).
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
@@ -52,11 +55,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
|
||||
(n > 0) && (n & (n - 1) == 0)
|
||||
}
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
@@ -134,4 +132,11 @@ pub(crate) mod test {
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_max_doc() {
|
||||
// this is the first time I write a unit test for a constant.
|
||||
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
||||
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
#[cfg(feature = "mmap")]
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
@@ -339,7 +340,7 @@ impl Index {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.map(SegmentMeta::id)
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
@@ -355,10 +356,8 @@ mod tests {
|
||||
use directory::RAMDirectory;
|
||||
use schema::Field;
|
||||
use schema::{Schema, INDEXED, TEXT};
|
||||
use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tempdir::TempDir;
|
||||
use Index;
|
||||
use IndexReader;
|
||||
use IndexWriter;
|
||||
@@ -444,61 +443,69 @@ mod tests {
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_specific {
|
||||
|
||||
#[test]
|
||||
fn test_index_manual_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
}
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
use tempdir::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
let reader = read_index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_manual_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
let reader = read_index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_index_on_commit_reload_policy_aux(
|
||||
@@ -530,4 +537,35 @@ mod tests {
|
||||
}
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
let directory = RAMDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema).unwrap();
|
||||
|
||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||
for i in 0u64..8_000u64 {
|
||||
writer.add_document(doc!(field => i));
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let mem_right_after_commit = directory.total_mem_usage();
|
||||
thread::sleep(Duration::from_millis(1_000));
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||
writer.wait_merging_threads().unwrap();
|
||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 8_000);
|
||||
assert!(mem_right_after_merge_finished < mem_right_after_commit);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use core::SegmentMeta;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use Opstamp;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -13,14 +14,27 @@ use std::fmt;
|
||||
///
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
/// List of `SegmentMeta` information associated to each finalized segment of the index.
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
/// Index `Schema`
|
||||
pub schema: Schema,
|
||||
pub opstamp: u64,
|
||||
/// Opstamp associated to the last `commit` operation.
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// Payload associated to the last commit.
|
||||
///
|
||||
/// Upon commit, clients can optionally add a small `String` payload to their commit
|
||||
/// to help identify this commit.
|
||||
/// This payload is entirely unused by tantivy.
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
/// Create an `IndexMeta` object representing a brand new `Index`
|
||||
/// with the given schema.
|
||||
///
|
||||
/// This new index does not contain any segments.
|
||||
/// Opstamp will have the value `0u64`.
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: vec![],
|
||||
|
||||
@@ -59,7 +59,7 @@ impl Searcher {
|
||||
) -> Searcher {
|
||||
let store_readers = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader())
|
||||
.map(SegmentReader::get_store_reader)
|
||||
.collect();
|
||||
Searcher {
|
||||
schema,
|
||||
@@ -218,7 +218,7 @@ impl fmt::Debug for Searcher {
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "Searcher({:?})", segment_ids)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ use schema::Schema;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::result;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
@@ -50,7 +51,7 @@ impl Segment {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
|
||||
@@ -5,6 +5,7 @@ use serde;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use Opstamp;
|
||||
|
||||
lazy_static! {
|
||||
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
||||
@@ -13,7 +14,7 @@ lazy_static! {
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: u64,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
@@ -136,9 +137,9 @@ impl SegmentMeta {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// Returns the `Opstamp` of the last delete operation
|
||||
/// taken into account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
@@ -152,7 +153,7 @@ impl SegmentMeta {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
|
||||
@@ -5,14 +5,10 @@ use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use directory::ReadOnlySource;
|
||||
use error::TantivyError;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fastfield::FastFieldReaders;
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
@@ -51,7 +47,7 @@ pub struct SegmentReader {
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_source: ReadOnlySource,
|
||||
@@ -105,93 +101,21 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
||||
/// May panick if the field is not a multivalued fastfield of the type `Item`.
|
||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
pub fn fast_fields(&self) -> &FastFieldReaders {
|
||||
&self.fast_fields_readers
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)));
|
||||
return None;
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field)?;
|
||||
let termdict = TermDictionary::from_source(&termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
Some(facet_reader)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
@@ -247,8 +171,12 @@ impl SegmentReader {
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
@@ -260,14 +188,13 @@ impl SegmentReader {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
@@ -319,7 +246,7 @@ impl SegmentReader {
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed?");
|
||||
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
@@ -381,12 +308,12 @@ impl SegmentReader {
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.get_store_reader().space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
.map(|x| x.space_usage())
|
||||
.map(DeleteBitSet::space_usage)
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -48,14 +48,14 @@ impl RetryPolicy {
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop`. The lock is released automatically on `Drop`.
|
||||
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
||||
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
directory: Box<Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
||||
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
fn from(underlying: Box<T>) -> Self {
|
||||
DirectoryLock(underlying)
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::path::PathBuf;
|
||||
/// Error while trying to acquire a directory lock.
|
||||
#[derive(Debug, Fail)]
|
||||
pub enum LockError {
|
||||
/// Failed to acquired a lock as it is already hold by another
|
||||
/// Failed to acquire a lock as it is already held by another
|
||||
/// client.
|
||||
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
|
||||
@@ -260,95 +260,98 @@ impl Clone for ManagedDirectory {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
mod mmap_specific {
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
}
|
||||
use super::super::*;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
}
|
||||
|
||||
use directory::MmapDirectory;
|
||||
use std::io::Write;
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap ")]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail to delete the file as it is mmapped.
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed files and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail to delete the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed files and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use self::notify::RawEvent;
|
||||
use self::notify::RecursiveMode;
|
||||
use self::notify::Watcher;
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use core::META_FILEPATH;
|
||||
use directory::error::LockError;
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
@@ -37,6 +36,11 @@ use std::sync::Weak;
|
||||
use std::thread;
|
||||
use tempdir::TempDir;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
@@ -316,7 +320,7 @@ impl MmapDirectory {
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
|
||||
@@ -86,7 +86,7 @@ impl InnerDirectory {
|
||||
self.fs
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(|el| el.clone())
|
||||
.map(Clone::clone)
|
||||
}
|
||||
|
||||
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
@@ -103,6 +103,10 @@ impl InnerDirectory {
|
||||
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
|
||||
self.watch_router.subscribe(watch_handle)
|
||||
}
|
||||
|
||||
fn total_mem_usage(&self) -> usize {
|
||||
self.fs.values().map(|f| f.len()).sum()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
@@ -126,6 +130,12 @@ impl RAMDirectory {
|
||||
pub fn create() -> RAMDirectory {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the sum of the size of the different files
|
||||
/// in the RAMDirectory.
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use common::BitSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
@@ -95,9 +96,23 @@ pub trait DocSet {
|
||||
}
|
||||
|
||||
/// Returns the number of documents matching.
|
||||
///
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
if !delete_bitset.is_deleted(self.doc()) {
|
||||
count += 1u32;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Returns the count of documents, deleted or not.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
///
|
||||
/// Of course, the result is an upper bound of the result
|
||||
/// given by `count()`.
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
count += 1u32;
|
||||
@@ -127,9 +142,14 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count()
|
||||
unboxed.count(delete_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count_including_deleted()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
|
||||
@@ -23,14 +23,14 @@ mod tests {
|
||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
|
||||
|
||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_val(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
||||
assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_bytes(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
|
||||
let long = vec![0u8; 1000];
|
||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
||||
assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use DocId;
|
||||
///
|
||||
/// Reading the value for a document is done by reading the start index for it,
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values: OwningRef<ReadOnlySource, [u8]>,
|
||||
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
|
||||
BytesFastFieldReader { idx_reader, values }
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
||||
fn range(&self, doc: DocId) -> (usize, usize) {
|
||||
let start = self.idx_reader.get(doc) as usize;
|
||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
|
||||
let (start, stop) = self.range(doc);
|
||||
&self.values[start..stop]
|
||||
}
|
||||
|
||||
/// Returns the overall number of bytes in this bytes fast field.
|
||||
pub fn total_num_bytes(&self) -> usize {
|
||||
self.values.len()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,16 +53,18 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether the document has been marked as deleted.
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||
!self.is_deleted(doc)
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline(always)]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
} else {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this bitset.
|
||||
|
||||
@@ -30,6 +30,7 @@ pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use common;
|
||||
@@ -43,6 +44,7 @@ mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
@@ -78,10 +80,6 @@ impl FastValue for u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
@@ -89,6 +87,10 @@ impl FastValue for u64 {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
|
||||
@@ -37,9 +37,7 @@ mod tests {
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = segment_reader
|
||||
.multi_fast_field_reader::<u64>(field)
|
||||
.unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[4u64]);
|
||||
@@ -198,9 +196,9 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[-4i64]);
|
||||
|
||||
@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
|
||||
MultiValueIntFastFieldReader {
|
||||
idx_reader: self.idx_reader,
|
||||
vals_reader: self.vals_reader.into_u64_reader(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
@@ -41,13 +48,24 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let (start, stop) = self.range(doc);
|
||||
(stop - start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field.
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, Schema};
|
||||
use schema::{Facet, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
@@ -58,22 +76,12 @@ mod tests {
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 30_000_000)
|
||||
.expect("Failed to create index writer.");
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
doc.add_facet(facet_field, "/category/cat1");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat3");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from("/category/cat2"),
|
||||
facet_field => Facet::from("/category/cat1"),
|
||||
));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
|
||||
FastFieldReader {
|
||||
bit_unpacker: self.bit_unpacker,
|
||||
min_value_u64: self.min_value_u64,
|
||||
max_value_u64: self.max_value_u64,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
|
||||
191
src/fastfield/readers.rs
Normal file
191
src/fastfield/readers.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use common::CompositeFile;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::MultiValueIntFastFieldReader;
|
||||
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||
use schema::{Cardinality, Field, FieldType, Schema};
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
|
||||
/// Provides access to all of the FastFieldReader.
|
||||
///
|
||||
/// Internally, `FastFieldReaders` has preloaded fast field readers,
|
||||
/// and just wraps several `HashMap`s.
|
||||
pub struct FastFieldReaders {
|
||||
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
|
||||
enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
match field_type {
|
||||
FieldType::U64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U64, cardinality)),
|
||||
FieldType::I64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::I64, cardinality)),
|
||||
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReaders {
|
||||
pub(crate) fn load_all(
|
||||
schema: &Schema,
|
||||
fast_fields_composite: &CompositeFile,
|
||||
) -> Result<FastFieldReaders> {
|
||||
let mut fast_field_readers = FastFieldReaders {
|
||||
fast_field_i64: Default::default(),
|
||||
fast_field_u64: Default::default(),
|
||||
fast_field_i64s: Default::default(),
|
||||
fast_field_u64s: Default::default(),
|
||||
fast_bytes: Default::default(),
|
||||
fast_fields_composite: fast_fields_composite.clone(),
|
||||
};
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let field_type = field_entry.field_type();
|
||||
if field_type == &FieldType::Bytes {
|
||||
let idx_reader = fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
fast_field_readers
|
||||
.fast_bytes
|
||||
.insert(field, BytesFastFieldReader::open(idx_reader, data));
|
||||
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => {
|
||||
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
|
||||
match fast_type {
|
||||
FastType::U64 => {
|
||||
let fast_field_reader = FastFieldReader::open(fast_field_data);
|
||||
fast_field_readers
|
||||
.fast_field_u64
|
||||
.insert(field, fast_field_reader);
|
||||
}
|
||||
FastType::I64 => {
|
||||
fast_field_readers.fast_field_i64.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
Cardinality::MultiValues => {
|
||||
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
|
||||
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
|
||||
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
|
||||
let idx_reader = FastFieldReader::open(fast_field_idx);
|
||||
match fast_type {
|
||||
FastType::I64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_i64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::U64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_u64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(fast_field_readers)
|
||||
}
|
||||
|
||||
/// Returns the per-field space usage reported by the underlying composite file.
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
    let composite_file = &self.fast_fields_composite;
    composite_file.space_usage()
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns `None`.
|
||||
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
self.fast_field_u64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64-fast field return the associated reader.
|
||||
/// If the field is a i64-fast field, return the associated u64 reader. Values are
|
||||
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
|
||||
///
|
||||
/// This method is useful when merging segment reader.
|
||||
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
if let Some(u64_ff_reader) = self.u64(field) {
|
||||
return Some(u64_ff_reader);
|
||||
}
|
||||
if let Some(i64_ff_reader) = self.i64(field) {
|
||||
return Some(i64_ff_reader.into_u64_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
|
||||
self.fast_field_i64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
|
||||
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
self.fast_field_u64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64s-fast field return the associated reader.
|
||||
/// If the field is a i64s-fast field, return the associated u64s reader. Values are
|
||||
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping.
|
||||
///
|
||||
/// This method is useful when merging segment reader.
|
||||
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
if let Some(u64s_ff_reader) = self.u64s(field) {
|
||||
return Some(u64s_ff_reader);
|
||||
}
|
||||
if let Some(i64s_ff_reader) = self.i64s(field) {
|
||||
return Some(i64s_ff_reader.into_u64s_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns a `i64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
|
||||
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
|
||||
self.fast_field_i64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
|
||||
self.fast_bytes.get(&field).cloned()
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,6 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ use super::operation::DeleteOperation;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Opstamp;
|
||||
|
||||
// The DeleteQueue is conceptually similar to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
@@ -184,7 +185,7 @@ impl DeleteCursor {
|
||||
/// queue are consume and the next get will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
||||
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||
// TODO Can be optimized as we work with blocks.
|
||||
while self.is_behind_opstamp(target_opstamp) {
|
||||
self.advance();
|
||||
@@ -192,7 +193,7 @@ impl DeleteCursor {
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
||||
self.get()
|
||||
.map(|operation| operation.opstamp < target_opstamp)
|
||||
.unwrap_or(false)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
use DocId;
|
||||
use Opstamp;
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
@@ -23,7 +24,7 @@ pub enum DocToOpstampMapping {
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
||||
}
|
||||
}
|
||||
@@ -35,7 +36,7 @@ impl DocToOpstampMapping {
|
||||
//
|
||||
// The edge case opstamp = some doc opstamp is in practise
|
||||
// never called.
|
||||
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
|
||||
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
|
||||
match *self {
|
||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||
match doc_opstamps.binary_search(&target_opstamp) {
|
||||
|
||||
@@ -30,6 +30,7 @@ use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
@@ -99,7 +100,7 @@ pub struct IndexWriter {
|
||||
delete_queue: DeleteQueue,
|
||||
|
||||
stamper: Stamper,
|
||||
committed_opstamp: u64,
|
||||
committed_opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -177,7 +178,7 @@ pub fn compute_deleted_bitset(
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
@@ -219,7 +220,7 @@ pub fn compute_deleted_bitset(
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<()> {
|
||||
{
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
@@ -299,11 +300,11 @@ fn index_documents(
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
@@ -331,7 +332,8 @@ fn index_documents(
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
/// The index writer
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
/// then drop the `IndexWriter`.
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
// this will stop the indexing thread,
|
||||
// dropping the last reference to the segment_updater.
|
||||
@@ -382,7 +384,6 @@ impl IndexWriter {
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.operation_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
@@ -461,6 +462,52 @@ impl IndexWriter {
|
||||
self.segment_updater.garbage_collect_files()
|
||||
}
|
||||
|
||||
/// Deletes all documents from the index
|
||||
///
|
||||
/// Requires `commit`ing
|
||||
/// Enables users to rebuild the index,
|
||||
/// by clearing and resubmitting necessary documents
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::*;
|
||||
/// use tantivy::Index;
|
||||
///
|
||||
/// fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
/// let schema = schema_builder.build();
|
||||
///
|
||||
/// let index = Index::create_in_ram(schema.clone());
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
||||
/// // have to commit, otherwise deleted terms remain available
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let searcher = index.reader()?.searcher();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query_promo = query_parser.parse_query("Promotheus")?;
|
||||
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
|
||||
///
|
||||
/// assert!(top_docs_promo.is_empty());
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub fn delete_all_documents(&mut self) -> Result<Opstamp> {
|
||||
// Delete segments
|
||||
self.segment_updater.remove_all_segments();
|
||||
// Return new stamp - reverted stamp
|
||||
self.stamper.revert(self.committed_opstamp);
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
///
|
||||
/// `segment_ids` is required to be non-empty.
|
||||
@@ -488,19 +535,22 @@ impl IndexWriter {
|
||||
|
||||
/// Rollback to the last commit
|
||||
///
|
||||
/// This cancels all of the update that
|
||||
/// happened before after the last commit.
|
||||
/// This cancels all of the updates that
|
||||
/// happened after the last commit.
|
||||
/// After calling rollback, the index is in the same
|
||||
/// state as it was after the last commit.
|
||||
///
|
||||
/// The opstamp at the last commit is returned.
|
||||
pub fn rollback(&mut self) -> Result<()> {
|
||||
pub fn rollback(&mut self) -> Result<Opstamp> {
|
||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||
self.rollback_impl()
|
||||
}
|
||||
|
||||
/// Private, implementation of rollback
|
||||
fn rollback_impl(&mut self) -> Result<Opstamp> {
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
|
||||
let document_receiver = self.operation_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
@@ -529,7 +579,7 @@ impl IndexWriter {
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
|
||||
Ok(())
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Prepares a commit.
|
||||
@@ -567,7 +617,7 @@ impl IndexWriter {
|
||||
info!("Preparing commit");
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
// and recreate a new one.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
@@ -601,7 +651,7 @@ impl IndexWriter {
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
pub fn commit(&mut self) -> Result<Opstamp> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
@@ -617,7 +667,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
||||
pub fn delete_term(&self, term: Term) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
@@ -631,7 +681,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// This is also the opstamp of the commit that is currently
|
||||
/// available for searchers.
|
||||
pub fn commit_opstamp(&self) -> u64 {
|
||||
pub fn commit_opstamp(&self) -> Opstamp {
|
||||
self.committed_opstamp
|
||||
}
|
||||
|
||||
@@ -645,7 +695,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Currently it represents the number of documents that
|
||||
/// have been added since the creation of the index.
|
||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||
@@ -662,7 +712,7 @@ impl IndexWriter {
|
||||
/// The total number of stamps generated by this method is `count + 1`;
|
||||
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||
/// is for the batch itself.
|
||||
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
|
||||
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||
let last_opstamp = end - 1;
|
||||
let stamps = Range {
|
||||
@@ -688,7 +738,7 @@ impl IndexWriter {
|
||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||
/// visible to readers only after calling `commit()`.
|
||||
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
|
||||
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||
let count = user_operations.len() as u64;
|
||||
if count == 0 {
|
||||
return self.stamper.stamp();
|
||||
@@ -739,7 +789,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let operations = vec![
|
||||
UserOperation::Add(doc!(text_field=>"a")),
|
||||
UserOperation::Add(doc!(text_field=>"b")),
|
||||
@@ -801,7 +851,7 @@ mod tests {
|
||||
fn test_empty_operations_group() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
let operations1 = vec![];
|
||||
let batch_opstamp1 = index_writer.run(operations1);
|
||||
assert_eq!(batch_opstamp1, 0u64);
|
||||
@@ -1048,4 +1098,145 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 0);
|
||||
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
|
||||
}
|
||||
|
||||
#[test]
fn test_add_then_delete_all_documents() {
    let mut schema_builder = schema::Schema::builder();
    let text_field = schema_builder.add_text_field("text", schema::TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    // The reader is reloaded manually, right before each lookup.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual)
        .try_into()
        .unwrap();
    let num_docs_containing = |text: &str| {
        reader.reload().unwrap();
        let term = Term::from_field_text(text_field, text);
        reader.searcher().doc_freq(&term)
    };
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    // Add one document and commit it.
    let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
    let commit_tstamp = index_writer.commit().unwrap();
    assert!(commit_tstamp > add_tstamp);

    // Clear the index and commit the clear.
    index_writer.delete_all_documents().unwrap();
    index_writer.commit().unwrap();

    // The term we added must not match any document anymore.
    assert_eq!(num_docs_containing("a"), 0);
}
|
||||
|
||||
#[test]
fn test_delete_all_documents_rollback_correct_stamp() {
    let mut schema_builder = schema::Schema::builder();
    let text_field = schema_builder.add_text_field("text", schema::TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    let add_tstamp = index_writer.add_document(doc!(text_field => "a"));

    // First commit: the document becomes available.
    let first_commit = index_writer.commit();
    assert!(first_commit.is_ok());
    let first_commit_tstamp = first_commit.unwrap();
    assert!(first_commit_tstamp > add_tstamp);

    // Clear the index.
    let clear_tstamp = index_writer.delete_all_documents().unwrap();
    assert_eq!(clear_tstamp, add_tstamp);

    // Second commit: the clear is applied, documents are gone.
    let second_commit = index_writer.commit();
    assert!(second_commit.is_ok());
    let second_commit_tstamp = second_commit.unwrap();

    // Add a batch of new documents, without committing them.
    for _ in 0..100 {
        index_writer.add_document(doc!(text_field => "b"));
    }

    // Roll back to the last commit, at which point the index was empty.
    let rollback = index_writer.rollback();
    assert!(rollback.is_ok());
    let rollback_tstamp = rollback.unwrap();
    assert_eq!(rollback_tstamp, second_commit_tstamp);

    // The index is empty: the uncommitted documents must not be found.
    let term_b = Term::from_field_text(text_field, "b");
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    assert_eq!(searcher.doc_freq(&term_b), 0);
}
|
||||
|
||||
#[test]
fn test_delete_all_documents_then_add() {
    let mut schema_builder = schema::Schema::builder();
    let text_field = schema_builder.add_text_field("text", schema::TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    // Clear the (still empty) index and commit.
    let clear_result = index_writer.delete_all_documents();
    assert!(clear_result.is_ok());
    assert!(index_writer.commit().is_ok());

    // Add one simple document and commit it.
    index_writer.add_document(doc!(text_field => "a"));
    assert!(index_writer.commit().is_ok());

    // The document holding that term is expected to be in the index.
    let term_a = Term::from_field_text(text_field, "a");
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    assert_eq!(searcher.doc_freq(&term_a), 1);
}
|
||||
|
||||
#[test]
fn test_delete_all_documents_and_rollback() {
    let mut schema_builder = schema::Schema::builder();
    let text_field = schema_builder.add_text_field("text", schema::TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    // Add one simple document and commit it.
    index_writer.add_document(doc!(text_field => "a"));
    let commit_result = index_writer.commit();
    assert!(commit_result.is_ok());
    let commit_tstamp = commit_result.unwrap();

    // Clear the index, but do not commit the clear.
    let clear_tstamp = index_writer.delete_all_documents().unwrap();
    // The stamp returned by the clear is expected to be lower than the commit stamp.
    assert!(clear_tstamp < commit_tstamp);

    // Roll back: the uncommitted clear is discarded.
    index_writer.rollback().unwrap();

    // The original document is expected to still be in the index.
    let term_a = Term::from_field_text(text_field, "a");
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    assert_eq!(searcher.doc_freq(&term_a), 1);
}
|
||||
|
||||
#[test]
fn test_delete_all_documents_empty_index() {
    let schema = schema::Schema::builder().build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    // Clearing and committing an index that never held a document must succeed.
    let clear_result = index_writer.delete_all_documents();
    let commit_result = index_writer.commit();
    assert!(clear_result.is_ok());
    assert!(commit_result.is_ok());
}
|
||||
|
||||
#[test]
fn test_delete_all_documents_index_twice() {
    let schema = schema::Schema::builder().build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

    // First clear + commit round.
    let first_clear = index_writer.delete_all_documents();
    let first_commit = index_writer.commit();
    assert!(first_clear.is_ok());
    assert!(first_commit.is_ok());

    // Clearing an already cleared index must succeed as well.
    let second_clear = index_writer.delete_all_documents();
    let second_commit = index_writer.commit();
    assert!(second_clear.is_ok());
    assert!(second_commit.is_ok());
}
|
||||
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(|x| x.num_docs())
|
||||
.map(SegmentMeta::num_docs)
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use census::{Inventory, TrackedObject};
|
||||
use std::collections::HashSet;
|
||||
use Opstamp;
|
||||
use SegmentId;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -17,8 +18,8 @@ impl MergeOperationInventory {
|
||||
}
|
||||
}
|
||||
|
||||
/// A `MergeOperation` has two role.
|
||||
/// It carries all of the information required to describe a merge :
|
||||
/// A `MergeOperation` has two roles.
|
||||
/// It carries all of the information required to describe a merge:
|
||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
||||
/// delete queue and reflect their deletes.
|
||||
/// - `segment_ids` is the list of segment to be merged.
|
||||
@@ -35,14 +36,14 @@ pub struct MergeOperation {
|
||||
}
|
||||
|
||||
struct InnerMergeOperation {
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
}
|
||||
|
||||
impl MergeOperation {
|
||||
pub fn new(
|
||||
inventory: &MergeOperationInventory,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
) -> MergeOperation {
|
||||
let inner_merge_operation = InnerMergeOperation {
|
||||
@@ -54,7 +55,7 @@ impl MergeOperation {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn target_opstamp(&self) -> u64 {
|
||||
pub fn target_opstamp(&self) -> Opstamp {
|
||||
self.inner.target_opstamp
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use common::MAX_DOC_LIMIT;
|
||||
use core::Segment;
|
||||
use core::SegmentReader;
|
||||
use core::SerializableSegment;
|
||||
use docset::DocSet;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastFieldSerializer;
|
||||
@@ -23,6 +25,7 @@ use termdict::TermMerger;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
use Result;
|
||||
use TantivyError;
|
||||
|
||||
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
|
||||
let mut total_tokens = 0u64;
|
||||
@@ -70,7 +73,7 @@ fn compute_min_max_val(
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
@@ -150,6 +153,14 @@ impl IndexMerger {
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
which exceeds the limit {}.",
|
||||
max_doc, MAX_DOC_LIMIT
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema,
|
||||
readers,
|
||||
@@ -229,7 +240,10 @@ impl IndexMerger {
|
||||
let mut max_value = u64::min_value();
|
||||
|
||||
for reader in &self.readers {
|
||||
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
|
||||
{
|
||||
@@ -272,24 +286,28 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
|
||||
|
||||
// In the first pass, we compute the total number of vals.
|
||||
//
|
||||
// This is required by the bitpacker, as it needs to know
|
||||
// what bit length should be used for bitpacking.
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
let u64s_reader = reader.fast_fields()
|
||||
.u64s_lenient(field)
|
||||
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
total_num_vals += end - start;
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = u64s_reader.num_vals(doc) as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += idx_reader.max_value();
|
||||
total_num_vals += u64s_reader.total_num_vals();
|
||||
}
|
||||
u64s_readers.push(u64s_reader);
|
||||
}
|
||||
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
@@ -297,13 +315,10 @@ impl IndexMerger {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
for doc in reader.doc_ids_alive() {
|
||||
for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
idx += end - start;
|
||||
idx += u64s_reader.num_vals(doc) as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
@@ -334,8 +349,10 @@ impl IndexMerger {
|
||||
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
|
||||
let term_ordinal_mapping: &[TermOrdinal] =
|
||||
term_ordinal_mappings.get_segment(segment_ord);
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
segment_reader.multi_fast_field_reader(field)?;
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
|
||||
.fast_fields()
|
||||
.u64s(field)
|
||||
.expect("Could not find multivalued u64 fast value reader.");
|
||||
// TODO optimize if no deletes
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -367,6 +384,8 @@ impl IndexMerger {
|
||||
|
||||
let mut vals = Vec::with_capacity(100);
|
||||
|
||||
let mut ff_readers = Vec::new();
|
||||
|
||||
// Our values are bitpacked and we need to know what should be
|
||||
// our bitwidth and our minimum value before serializing any values.
|
||||
//
|
||||
@@ -375,7 +394,10 @@ impl IndexMerger {
|
||||
// maximum value and initialize our Serializer.
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
reader.fast_fields().u64s_lenient(field).expect(
|
||||
"Failed to find multivalued fast field reader. This is a bug in \
|
||||
tantivy. Please report.",
|
||||
);
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
for &val in &vals {
|
||||
@@ -383,6 +405,7 @@ impl IndexMerger {
|
||||
max_value = cmp::max(val, max_value);
|
||||
}
|
||||
}
|
||||
ff_readers.push(ff_reader);
|
||||
// TODO optimize when no deletes
|
||||
}
|
||||
|
||||
@@ -395,9 +418,7 @@ impl IndexMerger {
|
||||
{
|
||||
let mut serialize_vals = fast_field_serializer
|
||||
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
|
||||
// TODO optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -416,19 +437,53 @@ impl IndexMerger {
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
self.write_fast_field_idx(field, fast_field_serializer)?;
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
|
||||
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.fast_fields().bytes(field).expect(
|
||||
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
|
||||
);
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += bytes_reader.total_num_bytes() as u64;
|
||||
}
|
||||
bytes_readers.push(bytes_reader);
|
||||
}
|
||||
|
||||
{
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
// can effectively push the different indexes.
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
idx += bytes_reader.get_bytes(doc).len() as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
serialize_idx.close_field()?;
|
||||
}
|
||||
|
||||
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field)?;
|
||||
for segment_reader in &self.readers {
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field)
|
||||
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
|
||||
// TODO: optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_val(doc);
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_bytes(doc);
|
||||
serialize_vals.write_all(val)?;
|
||||
}
|
||||
}
|
||||
serialize_vals.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -969,14 +1024,16 @@ mod tests {
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(1)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
@@ -1027,7 +1084,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1073,7 +1131,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1125,7 +1184,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1371,7 +1431,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
@@ -1406,7 +1466,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(1u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[28, 27]);
|
||||
|
||||
@@ -1416,7 +1476,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(2u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[20]);
|
||||
}
|
||||
@@ -1449,7 +1509,7 @@ mod tests {
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
use schema::Document;
|
||||
use schema::Term;
|
||||
use Opstamp;
|
||||
|
||||
/// Timestamped Delete operation.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct DeleteOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// Timestamped Add operation.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct AddOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub document: Document,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
use super::IndexWriter;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: u64,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
pub fn opstamp(&self) -> Opstamp {
|
||||
self.opstamp
|
||||
}
|
||||
|
||||
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
|
||||
self.payload = Some(payload.to_string())
|
||||
}
|
||||
|
||||
pub fn abort(self) -> Result<()> {
|
||||
pub fn abort(self) -> Result<Opstamp> {
|
||||
self.index_writer.rollback()
|
||||
}
|
||||
|
||||
pub fn commit(self) -> Result<u64> {
|
||||
pub fn commit(self) -> Result<Opstamp> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
|
||||
@@ -118,6 +118,12 @@ impl SegmentManager {
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
registers_lock.uncommitted.clear();
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
|
||||
@@ -56,7 +56,7 @@ impl SegmentRegister {
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
segment_ids.sort_by_key(|meta| meta.id());
|
||||
segment_ids.sort_by_key(SegmentMeta::id);
|
||||
segment_ids
|
||||
}
|
||||
|
||||
|
||||
@@ -36,14 +36,15 @@ use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic :
|
||||
/// Either
|
||||
// - it fails, in which case an error is returned,
|
||||
/// - it fails, in which case an error is returned,
|
||||
/// and the `meta.json` remains untouched,
|
||||
/// - it success, and `meta.json` is written
|
||||
/// - it succeeds, and `meta.json` is written
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
@@ -69,6 +70,7 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
|
||||
info!("save metas");
|
||||
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||
// Just adding a new line at the end of the buffer.
|
||||
writeln!(&mut buffer)?;
|
||||
@@ -212,6 +214,11 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
/// Orders `SegmentManager` to remove all segments
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
self.0.segment_manager.remove_all_segments();
|
||||
}
|
||||
|
||||
pub fn kill(&mut self) {
|
||||
self.0.killed.store(true, Ordering::Release);
|
||||
}
|
||||
@@ -222,9 +229,9 @@ impl SegmentUpdater {
|
||||
|
||||
/// Apply deletes up to the target opstamp to all segments.
|
||||
///
|
||||
/// Tne method returns copies of the segment entries,
|
||||
/// The method returns copies of the segment entries,
|
||||
/// updated with the delete information.
|
||||
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
|
||||
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
|
||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = self.0.index.segment(segment_entry.meta().clone());
|
||||
@@ -233,7 +240,7 @@ impl SegmentUpdater {
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
|
||||
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
@@ -280,7 +287,7 @@ impl SegmentUpdater {
|
||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
|
||||
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
@@ -420,6 +427,7 @@ impl SegmentUpdater {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
merge_candidates.extend(committed_merge_candidates.into_iter());
|
||||
|
||||
for merge_operation in merge_candidates {
|
||||
match self.start_merge_impl(merge_operation) {
|
||||
Ok(merge_future) => {
|
||||
@@ -444,38 +452,41 @@ impl SegmentUpdater {
|
||||
) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) =
|
||||
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
|
||||
{
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
{
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) = advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// the segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
} // we drop all possible handle to a now useless `SegmentMeta`.
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
@@ -649,4 +660,31 @@ mod tests {
|
||||
assert!(index.searchable_segment_metas().unwrap().is_empty());
|
||||
assert!(reader.searcher().segment_readers().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_all_segments() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index_writer.segment_updater().remove_all_segments();
|
||||
let seg_vec = index_writer
|
||||
.segment_updater()
|
||||
.0
|
||||
.segment_manager
|
||||
.segment_entries();
|
||||
assert!(seg_vec.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
@@ -15,10 +16,11 @@ use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use DocId;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// documents.
|
||||
/// set of documents.
|
||||
///
|
||||
/// It creates the postings lists in anonymous memory.
|
||||
/// The segment is laid on disk when the segment gets `finalized`.
|
||||
@@ -28,7 +30,7 @@ pub struct SegmentWriter {
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
@@ -53,7 +55,7 @@ impl SegmentWriter {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| field_entry.field_type())
|
||||
.map(FieldEntry::field_type)
|
||||
.map(|field_type| match *field_type {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
|
||||
@@ -1,76 +1,39 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use Opstamp;
|
||||
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(AtomicUsize);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.fetch_add(val as usize, order) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
/// Under other architecture, we rely on a mutex.
|
||||
use std::sync::RwLock;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(RwLock<u64>);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(RwLock::new(first_opstamp))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
||||
let mut lock = self.0.write().unwrap();
|
||||
let previous_val = *lock;
|
||||
*lock = previous_val + incr;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
use self::archicture_impl::AtomicU64Ersatz;
|
||||
|
||||
/// Stamper provides Opstamps, which is just an auto-increment id to label
|
||||
/// an operation.
|
||||
///
|
||||
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
||||
pub fn new(first_opstamp: Opstamp) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
pub fn stamp(&self) -> Opstamp {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
|
||||
/// Given a desired count `n`, `stamps` returns an iterator that
|
||||
/// will supply `n` number of u64 stamps.
|
||||
pub fn stamps(&self, n: u64) -> Range<u64> {
|
||||
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
|
||||
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
||||
Range {
|
||||
start,
|
||||
end: start + n,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverts the stamper to a given `Opstamp` value and returns it
|
||||
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
|
||||
self.0.store(to_opstamp, Ordering::SeqCst);
|
||||
to_opstamp
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -92,4 +55,18 @@ mod test {
|
||||
assert_eq!(stamper.stamps(3u64), (12..15));
|
||||
assert_eq!(stamper.stamp(), 15u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stamper_revert() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper_clone.stamp(), 9u64);
|
||||
|
||||
stamper.revert(6);
|
||||
assert_eq!(stamper.stamp(), 6);
|
||||
assert_eq!(stamper_clone.stamp(), 7);
|
||||
}
|
||||
}
|
||||
|
||||
37
src/lib.rs
37
src/lib.rs
@@ -174,6 +174,7 @@ extern crate downcast_rs;
|
||||
#[macro_use]
|
||||
extern crate fail;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
@@ -225,7 +226,7 @@ mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use indexer::IndexWriter;
|
||||
@@ -253,6 +254,16 @@ pub mod merge_policy {
|
||||
/// as they are added in the segment.
|
||||
pub type DocId = u32;
|
||||
|
||||
/// A u64 assigned to every operation incrementally
|
||||
///
|
||||
/// Every operation modifying the index receives a monotonic Opstamp.
|
||||
/// The resulting state of the index is consistent with the opstamp ordering.
|
||||
///
|
||||
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
|
||||
/// with an opstamp `<= 32_423`. A delete operation with opstamp n will not affect a document added
|
||||
/// with opstamp `n+1`.
|
||||
pub type Opstamp = u64;
|
||||
|
||||
/// A f32 that represents the relevance of the document to the query
|
||||
///
|
||||
/// This is modelled internally as a `f32`. The
|
||||
@@ -875,28 +886,28 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
}
|
||||
|
||||
249
src/postings/block_search.rs
Normal file
249
src/postings/block_search.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
use postings::compression::AlignedBuffer;
|
||||
|
||||
/// This module defines the logic used to search for a doc in a given
|
||||
/// block. (at most 128 docs)
|
||||
///
|
||||
/// Searching within a block is a hotspot when running intersections,
|
||||
/// so it was worth defining it in its own module.
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod sse2 {
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
use std::arch::x86_64::__m128i as DataType;
|
||||
use std::arch::x86_64::_mm_add_epi32 as op_add;
|
||||
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
|
||||
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
|
||||
use std::arch::x86_64::_mm_set1_epi32 as set1;
|
||||
use std::arch::x86_64::_mm_setzero_si128 as set0;
|
||||
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
|
||||
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
|
||||
|
||||
const MASK1: i32 = 78;
|
||||
const MASK2: i32 = 177;
|
||||
|
||||
/// Performs an exhaustive linear search over the whole block.
|
||||
///
|
||||
/// There is no early exit here. We simply count the
|
||||
/// number of elements that are `< target`.
|
||||
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
|
||||
unsafe {
|
||||
let ptr = arr as *const AlignedBuffer as *const DataType;
|
||||
let vkey = set1(target as i32);
|
||||
let mut cnt = set0();
|
||||
// We work over 4 `__m128i` at a time.
|
||||
// A single `__m128i` actually contains 4 `u32`.
|
||||
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
|
||||
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
|
||||
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
|
||||
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
|
||||
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
|
||||
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
|
||||
cnt = op_sub(cnt, sum);
|
||||
}
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
|
||||
_mm_cvtsi128_si32(cnt) as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::linear_search_sse2_128;
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search_sse2_128_u32() {
|
||||
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
for el in 0u32..128u32 {
|
||||
block[el as usize] = el * 2 + 1 << 18;
|
||||
}
|
||||
let target = block[64] + 1;
|
||||
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This `linear search` browses exhaustively through the array.
|
||||
/// but the early exit is very difficult to predict.
|
||||
///
|
||||
/// Coupled with `exponential search` this function is likely
|
||||
/// to be called with the same `len`
|
||||
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||
}
|
||||
|
||||
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||
let end = arr.len();
|
||||
let mut begin = 0;
|
||||
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||
if pivot >= end {
|
||||
break;
|
||||
}
|
||||
if arr[pivot] > target {
|
||||
return (begin, pivot);
|
||||
}
|
||||
begin = pivot;
|
||||
}
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(&block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
}
|
||||
|
||||
/// Tantivy may rely on SIMD instructions to search for a specific document within
|
||||
/// a given block.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub enum BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
SSE2,
|
||||
Scalar,
|
||||
}
|
||||
|
||||
impl BlockSearcher {
|
||||
/// Search the first index containing an element greater or equal to
|
||||
/// the target.
|
||||
///
|
||||
/// The results should be equivalent to
|
||||
/// ```ignore
|
||||
/// block[..]
|
||||
///     .iter()
|
||||
///     .take_while(|&&val| val < target)
|
||||
///     .count()
|
||||
/// ```
|
||||
///
|
||||
/// The `start` argument is just used to hint that the response is
|
||||
/// at or beyond `start`. The implementation may or may not use
|
||||
/// it for optimization.
|
||||
///
|
||||
/// # Assumption
|
||||
///
|
||||
/// The array len is > start.
|
||||
/// The block is sorted
|
||||
/// The target is assumed greater or equal to the `arr[start]`.
|
||||
/// The target is assumed smaller or equal to the last element of the block.
|
||||
///
|
||||
/// Currently the scalar implementation starts by an exponential search, and
|
||||
/// then operates a linear search in the result subarray.
|
||||
///
|
||||
/// If SSE2 instructions are available in the `(platform, running CPU)`,
|
||||
/// then we use a different implementation that does an exhaustive linear search over
|
||||
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
|
||||
/// of branch.
|
||||
pub(crate) fn search_in_block(
|
||||
self,
|
||||
block_docs: &AlignedBuffer,
|
||||
len: usize,
|
||||
start: usize,
|
||||
target: u32,
|
||||
) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
use postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
start + galloping(&block_docs.0[start..len], target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BlockSearcher {
|
||||
fn default() -> BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("sse2") {
|
||||
return BlockSearcher::SSE2;
|
||||
}
|
||||
}
|
||||
BlockSearcher::Scalar
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::exponential_search;
|
||||
use super::linear_search;
|
||||
use super::BlockSearcher;
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search() {
|
||||
let len: usize = 50;
|
||||
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
||||
for target in 1..*arr.last().unwrap() {
|
||||
let res = linear_search(&arr[..], target);
|
||||
if res > 0 {
|
||||
assert!(arr[res - 1] < target);
|
||||
}
|
||||
if res < len {
|
||||
assert!(arr[res] >= target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
||||
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
|
||||
let cursor = search_in_block_trivial_but_slow(block, target);
|
||||
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
|
||||
output_buffer[..block.len()].copy_from_slice(block);
|
||||
for i in 0..cursor {
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(
|
||||
&AlignedBuffer(output_buffer),
|
||||
block.len(),
|
||||
i,
|
||||
target
|
||||
),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
||||
use std::collections::HashSet;
|
||||
let mut targets = HashSet::new();
|
||||
for (i, val) in block.iter().cloned().enumerate() {
|
||||
if i > 0 {
|
||||
targets.insert(val - 1);
|
||||
}
|
||||
targets.insert(val);
|
||||
}
|
||||
for target in targets {
|
||||
util_test_search_in_block(block_searcher, block, target);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reference implementation used by the tests: returns the number of leading
/// elements of `block` that are strictly smaller than `target`, i.e. the index
/// of the first element `>= target`, or `block.len()` if there is none.
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
    let first_reaching_target = block.iter().position(|&doc| doc >= target);
    match first_reaching_target {
        Some(idx) => idx,
        None => block.len(),
    }
}
|
||||
|
||||
fn test_search_in_block_util(block_searcher: BlockSearcher) {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_in_block_all(block_searcher, &v[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_in_block_scalar() {
|
||||
test_search_in_block_util(BlockSearcher::Scalar);
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[test]
|
||||
fn test_search_in_block_sse2() {
|
||||
test_search_in_block_util(BlockSearcher::SSE2);
|
||||
}
|
||||
}
|
||||
@@ -43,9 +43,14 @@ impl BlockEncoder {
|
||||
}
|
||||
}
|
||||
|
||||
/// We ensure that the `AlignedBuffer` is aligned on at least 128 bits (`repr(align(128))` is expressed in bytes, so it is in fact aligned on 128 bytes)
|
||||
/// in order to run SSE2 linear search on it.
|
||||
#[repr(align(128))]
|
||||
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
output: AlignedBuffer,
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
@@ -55,11 +60,9 @@ impl BlockDecoder {
|
||||
}
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
|
||||
output[COMPRESSION_BLOCK_SIZE] = 0u32;
|
||||
BlockDecoder {
|
||||
bitpacker: BitPacker4x::new(),
|
||||
output,
|
||||
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
@@ -72,23 +75,28 @@ impl BlockDecoder {
|
||||
) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress(&compressed_data, &mut self.output, num_bits)
|
||||
.decompress(&compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
&self.output.0[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
(&self.output, self.output_len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
self.output.0[idx]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,12 +167,12 @@ impl VIntDecoder for BlockDecoder {
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_search;
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
///
|
||||
@@ -16,6 +17,8 @@ mod skip;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::block_search::BlockSearcher;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
@@ -52,13 +55,15 @@ pub mod tests {
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use merge_policy::NoMergePolicy;
|
||||
use query::Scorer;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
|
||||
use schema::{Field, TextOptions};
|
||||
use schema::{IndexRecordOption, TextFieldIndexing};
|
||||
use std::iter;
|
||||
use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -104,9 +109,7 @@ pub mod tests {
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
|
||||
let term = Term::from_field_text(title, "abc");
|
||||
|
||||
let mut positions = Vec::new();
|
||||
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -159,6 +162,52 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() {
|
||||
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
|
||||
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
|
||||
exceeding_token_text.push_str(" hello");
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("simple_no_truncation"),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes, b"hello");
|
||||
}
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(1u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut positions = Vec::new();
|
||||
|
||||
@@ -12,8 +12,8 @@ use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::{Token, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
@@ -210,8 +210,18 @@ pub trait PostingsWriter {
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
// We skip all tokens whose length exceeds `MAX_TOKEN_LEN`.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
} else {
|
||||
info!(
|
||||
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||
MAX_TOKEN_LEN in the documentation for more information.",
|
||||
token.text.len(),
|
||||
MAX_TOKEN_LEN
|
||||
);
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
|
||||
@@ -4,9 +4,10 @@ use common::{BinarySerializable, VInt};
|
||||
use docset::{DocSet, SkipResult};
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
use postings::compression::compressed_block_size;
|
||||
use postings::compression::{compressed_block_size, AlignedBuffer};
|
||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use postings::BlockSearcher;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
use postings::SkipReader;
|
||||
@@ -60,6 +61,7 @@ pub struct SegmentPostings {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
position_computer: Option<PositionComputer>,
|
||||
block_searcher: BlockSearcher,
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
@@ -70,6 +72,7 @@ impl SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,42 +120,33 @@ impl SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
position_computer: positions_stream_opt.map(PositionComputer::new),
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Counts the elements of `arr` that are strictly smaller than `target`.
///
/// Every element is visited; the comparison result is added as 0 or 1.
fn linear_search(arr: &[u32], target: u32) -> usize {
    let mut smaller_count = 0usize;
    for &value in arr {
        smaller_count += (value < target) as usize;
    }
    smaller_count
}
|
||||
|
||||
/// Narrows down the range of `arr` to search by probing the positions
/// 1, 3, 7, 15, 31 and 63 (as long as they are within bounds).
///
/// Returns `(begin, end)` where `end` is the first probed position holding a
/// value strictly greater than `target` (or `arr.len()` if no probe did), and
/// `begin` is the last position probed before it (or `0` if there was none).
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    const PROBES: [usize; 6] = [1, 3, 7, 15, 31, 63];
    let len = arr.len();
    let mut lower = 0;
    for probe in PROBES.iter().cloned().take_while(|&probe| probe < len) {
        if arr[probe] > target {
            return (lower, probe);
        }
        lower = probe;
    }
    (lower, len)
}
|
||||
|
||||
/// Search the first index containing an element greater or equal to the target.
|
||||
///
|
||||
/// # Assumption
|
||||
///
|
||||
/// The array is assumed non empty.
|
||||
/// The target is assumed greater or equal to the first element.
|
||||
/// The target is assumed smaller or equal to the last element.
|
||||
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
}
|
||||
|
||||
impl DocSet for SegmentPostings {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
position_computer.add_skip(term_freq);
|
||||
}
|
||||
}
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
@@ -175,7 +169,6 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
// skip blocks until one that might contain the target
|
||||
// check if we need to go to the next block
|
||||
let need_positions = self.position_computer.is_some();
|
||||
let mut sum_freqs_skipped: u32 = 0;
|
||||
if !self
|
||||
.block_cursor
|
||||
@@ -189,7 +182,7 @@ impl DocSet for SegmentPostings {
|
||||
// we are not in the right block.
|
||||
//
|
||||
// First compute all of the freqs skipped from the current block.
|
||||
if need_positions {
|
||||
if self.position_computer.is_some() {
|
||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||
match self.block_cursor.skip_to(target) {
|
||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||
@@ -208,25 +201,21 @@ impl DocSet for SegmentPostings {
|
||||
self.cur = 0;
|
||||
}
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
let cur = self.cur;
|
||||
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
.sum::<u32>();
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freqs_skipped as usize);
|
||||
// we're in the right block now, search within it using the block searcher
|
||||
let (output, len) = self.block_cursor.docs_aligned();
|
||||
let new_cur = self
|
||||
.block_searcher
|
||||
.search_in_block(&output, len, cur, target);
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
|
||||
position_computer.add_skip(sum_freqs_skipped as usize);
|
||||
}
|
||||
self.cur = new_cur;
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
let doc = block_docs[new_cur];
|
||||
let doc = output.0[new_cur];
|
||||
debug_assert!(doc >= target);
|
||||
if doc == target {
|
||||
SkipResult::Reached
|
||||
@@ -235,40 +224,25 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.position_computer.is_some() {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
self.position_computer.as_mut().unwrap().add_skip(term_freq);
|
||||
}
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panic if called without having called advance before.
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
let docs = self.block_cursor.docs();
|
||||
debug_assert!(
|
||||
self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc()."
|
||||
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
|
||||
);
|
||||
docs[self.cur]
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
// finish the current block
|
||||
if self.advance() {
|
||||
@@ -292,17 +266,33 @@ impl HasLen for SegmentPostings {
|
||||
}
|
||||
|
||||
impl Postings for SegmentPostings {
|
||||
/// Returns the frequency associated to the current document.
|
||||
/// If the schema is set up so that no frequency have been encoded,
|
||||
/// this method should always return 1.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panic if called without having called advance before.
|
||||
fn term_freq(&self) -> u32 {
|
||||
debug_assert!(
|
||||
// Here we do not use the len of `freqs()`
|
||||
// because it is actually ok to request for the freq of doc
|
||||
// even if no frequency were encoded for the field.
|
||||
//
|
||||
// In that case we hit the block just as if the frequency had been
|
||||
// decoded. The block is simply prefilled by the value 1.
|
||||
self.cur < COMPRESSION_BLOCK_SIZE,
|
||||
"Have you forgotten to call `.advance()` at least once before calling \
|
||||
`.term_freq()`."
|
||||
);
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
if self.position_computer.is_some() {
|
||||
output.resize(self.term_freq() as usize, 0u32);
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.positions_with_offset(offset, &mut output[..])
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_comp) = self.position_computer.as_mut() {
|
||||
output.resize(term_freq, 0u32);
|
||||
position_comp.positions_with_offset(offset, &mut output[..]);
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
@@ -424,6 +414,10 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
@@ -614,16 +608,13 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::exponential_search;
|
||||
use super::linear_search;
|
||||
use super::search_within_block;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
use super::SegmentPostings;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
use docset::DocSet;
|
||||
use postings::postings::Postings;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
@@ -632,21 +623,6 @@ mod tests {
|
||||
use DocId;
|
||||
use SkipResult;
|
||||
|
||||
#[test]
|
||||
fn test_linear_search() {
|
||||
let len: usize = 50;
|
||||
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
||||
for target in 1..*arr.last().unwrap() {
|
||||
let res = linear_search(&arr[..], target);
|
||||
if res > 0 {
|
||||
assert!(arr[res - 1] < target);
|
||||
}
|
||||
if res < len {
|
||||
assert!(arr[res] >= target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
@@ -655,6 +631,18 @@ mod tests {
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_doc_called_before_advance() {
|
||||
SegmentPostings::empty().doc();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_freq_called_before_advance() {
|
||||
SegmentPostings::empty().term_freq();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_block_segment_postings() {
|
||||
let mut postings = BlockSegmentPostings::empty();
|
||||
@@ -662,56 +650,6 @@ mod tests {
|
||||
assert_eq!(postings.doc_freq(), 0);
|
||||
}
|
||||
|
||||
/// Reference implementation used by the tests: returns the index of the first
/// element of `block` that is greater than or equal to `target`.
///
/// # Panics
///
/// Panics if no element of `block` is greater than or equal to `target`.
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
    block
        .iter()
        .position(|&val| val >= target)
        .expect("the block must contain an element greater than or equal to the target")
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
||||
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
search_within_block_trivial_but_slow(block, target)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block_all(block: &[u32]) {
|
||||
use std::collections::HashSet;
|
||||
let mut targets = HashSet::new();
|
||||
for (i, val) in block.iter().cloned().enumerate() {
|
||||
if i > 0 {
|
||||
targets.insert(val - 1);
|
||||
}
|
||||
targets.insert(val);
|
||||
}
|
||||
for target in targets {
|
||||
util_test_search_within_block(block, target);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_within_block() {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_within_block_all(&v[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
|
||||
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// `InvertedIndexSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
/// * `.idx` (inverted index)
|
||||
/// * `.pos` (positions file)
|
||||
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
|
||||
}
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
/// Open a new `InvertedIndexSerializer` for the given segment
|
||||
fn create(
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
let positions_idx = self
|
||||
.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.positions_idx())
|
||||
.map(PositionSerializer::positions_idx)
|
||||
.unwrap_or(0u64);
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use core::Searcher;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::{Query, Scorer, Weight};
|
||||
use query::explanation::does_not_match;
|
||||
use query::{Explanation, Query, Scorer, Weight};
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -29,6 +30,13 @@ impl Weight for AllWeight {
|
||||
max_doc: reader.max_doc(),
|
||||
}))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
if doc >= reader.max_doc() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(Explanation::new("AllQuery", 1f32))
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
use common::BitSet;
|
||||
use core::SegmentReader;
|
||||
use query::BitSetDocSet;
|
||||
use query::ConstScorer;
|
||||
use query::{BitSetDocSet, Explanation};
|
||||
use query::{Scorer, Weight};
|
||||
use schema::{Field, IndexRecordOption};
|
||||
use tantivy_fst::Automaton;
|
||||
use termdict::{TermDictionary, TermStreamer};
|
||||
use Result;
|
||||
use DocId;
|
||||
use TantivyError;
|
||||
use {Result, SkipResult};
|
||||
|
||||
/// A weight struct for Fuzzy Term and Regex Queries
|
||||
pub struct AutomatonWeight<A>
|
||||
@@ -56,4 +58,15 @@ where
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(doc_bitset)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if scorer.skip_next(doc) == SkipResult::Reached {
|
||||
Ok(Explanation::new("AutomatonScorer", 1.0f32))
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
"Document does not exist".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use fieldnorm::FieldNormReader;
|
||||
use query::Explanation;
|
||||
use Score;
|
||||
use Searcher;
|
||||
use Term;
|
||||
@@ -26,18 +27,13 @@ fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BM25Weight {
|
||||
idf_explain: Explanation,
|
||||
weight: f32,
|
||||
cache: [f32; 256],
|
||||
average_fieldnorm: f32,
|
||||
}
|
||||
|
||||
impl BM25Weight {
|
||||
pub fn null() -> BM25Weight {
|
||||
BM25Weight {
|
||||
weight: 0f32,
|
||||
cache: [1f32; 256],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
|
||||
assert!(!terms.is_empty(), "BM25 requires at least one term");
|
||||
let field = terms[0].field();
|
||||
@@ -58,20 +54,37 @@ impl BM25Weight {
|
||||
}
|
||||
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
|
||||
|
||||
let idf = terms
|
||||
.iter()
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
})
|
||||
.sum::<f32>();
|
||||
BM25Weight::new(idf, average_fieldnorm)
|
||||
let mut idf_explain: Explanation;
|
||||
if terms.len() == 1 {
|
||||
let term_doc_freq = searcher.doc_freq(&terms[0]);
|
||||
let idf = idf(term_doc_freq, total_num_docs);
|
||||
idf_explain =
|
||||
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
|
||||
idf_explain.add_const(
|
||||
"n, number of docs containing this term",
|
||||
term_doc_freq as f32,
|
||||
);
|
||||
idf_explain.add_const("N, total number of docs", total_num_docs as f32);
|
||||
} else {
|
||||
let idf = terms
|
||||
.iter()
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
})
|
||||
.sum::<f32>();
|
||||
idf_explain = Explanation::new("idf", idf);
|
||||
}
|
||||
BM25Weight::new(idf_explain, average_fieldnorm)
|
||||
}
|
||||
|
||||
fn new(idf: f32, average_fieldnorm: f32) -> BM25Weight {
|
||||
fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
|
||||
let weight = idf_explain.value() * (1f32 + K1);
|
||||
BM25Weight {
|
||||
weight: idf * (1f32 + K1),
|
||||
idf_explain,
|
||||
weight,
|
||||
cache: compute_tf_cache(average_fieldnorm),
|
||||
average_fieldnorm,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,6 +94,37 @@ impl BM25Weight {
|
||||
let term_freq = term_freq as f32;
|
||||
self.weight * term_freq / (term_freq + norm)
|
||||
}
|
||||
|
||||
pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation {
|
||||
// The explain format is directly copied from Lucene's.
|
||||
// (So, Kudos to Lucene)
|
||||
|
||||
let score = self.score(fieldnorm_id, term_freq);
|
||||
|
||||
let norm = self.cache[fieldnorm_id as usize];
|
||||
let term_freq = term_freq as f32;
|
||||
let right_factor = term_freq / (term_freq + norm);
|
||||
|
||||
let mut tf_explanation = Explanation::new(
|
||||
"freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
right_factor,
|
||||
);
|
||||
|
||||
tf_explanation.add_const("freq, occurrences of term within document", term_freq);
|
||||
tf_explanation.add_const("k1, term saturation parameter", K1);
|
||||
tf_explanation.add_const("b, length normalization parameter", B);
|
||||
tf_explanation.add_const(
|
||||
"dl, length of field",
|
||||
FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32,
|
||||
);
|
||||
tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm);
|
||||
|
||||
let mut explanation = Explanation::new("TermQuery, product of...", score);
|
||||
explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32));
|
||||
explanation.add_detail(self.idf_explain.clone());
|
||||
explanation.add_detail(tf_explanation);
|
||||
explanation
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use core::SegmentReader;
|
||||
use query::intersect_scorers;
|
||||
use query::explanation::does_not_match;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use query::term_query::TermScorer;
|
||||
use query::EmptyScorer;
|
||||
@@ -9,8 +9,10 @@ use query::RequiredOptionalScorer;
|
||||
use query::Scorer;
|
||||
use query::Union;
|
||||
use query::Weight;
|
||||
use query::{intersect_scorers, Explanation};
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
use {DocId, SkipResult};
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
|
||||
where
|
||||
@@ -50,10 +52,10 @@ impl BooleanWeight {
|
||||
}
|
||||
}
|
||||
|
||||
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
fn per_occur_scorers(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<Scorer>> {
|
||||
) -> Result<HashMap<Occur, Vec<Box<Scorer>>>> {
|
||||
let mut per_occur_scorers: HashMap<Occur, Vec<Box<Scorer>>> = HashMap::new();
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
let sub_scorer: Box<Scorer> = subweight.scorer(reader)?;
|
||||
@@ -62,6 +64,14 @@ impl BooleanWeight {
|
||||
.or_insert_with(Vec::new)
|
||||
.push(sub_scorer);
|
||||
}
|
||||
Ok(per_occur_scorers)
|
||||
}
|
||||
|
||||
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<Scorer>> {
|
||||
let mut per_occur_scorers = self.per_occur_scorers(reader)?;
|
||||
|
||||
let should_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
@@ -118,4 +128,31 @@ impl Weight for BooleanWeight {
|
||||
self.complex_scorer::<DoNothingCombiner>(reader)
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
if !self.scoring_enabled {
|
||||
return Ok(Explanation::new("BooleanQuery with no scoring", 1f32));
|
||||
}
|
||||
|
||||
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
if is_positive_occur(*occur) {
|
||||
if let Ok(child_explanation) = subweight.explain(reader, doc) {
|
||||
explanation.add_detail(child_explanation);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_positive_occur(occur: Occur) -> bool {
|
||||
match occur {
|
||||
Occur::Must | Occur::Should => true,
|
||||
Occur::MustNot => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,8 +18,8 @@ mod tests {
|
||||
use query::Scorer;
|
||||
use query::TermQuery;
|
||||
use schema::*;
|
||||
use DocId;
|
||||
use Index;
|
||||
use {DocAddress, DocId};
|
||||
|
||||
fn aux_test_helper() -> (Index, Field) {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -205,4 +205,167 @@ mod tests {
|
||||
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
|
||||
}
|
||||
}
|
||||
|
||||
// motivated by #554
|
||||
#[test]
|
||||
fn test_bm25_several_fields() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 0
|
||||
title => "Законы притяжения Оксана Кулакова",
|
||||
// tf = 1 0
|
||||
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 0
|
||||
title => "Любимые русские пироги (Оксана Путан)",
|
||||
// tf = 2 0
|
||||
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 1
|
||||
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
|
||||
// tf = 0 0
|
||||
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
|
||||
));
|
||||
for _ in 0..1_000 {
|
||||
index_writer.add_document(doc!(
|
||||
title => "a b d e f g",
|
||||
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
|
||||
));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, text]);
|
||||
let query = query_parser
|
||||
.parse_query("Оксана Лифенко")
|
||||
.unwrap();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
scorer.advance();
|
||||
|
||||
let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
|
||||
assert_eq!(
|
||||
explanation.to_pretty_json(),
|
||||
r#"{
|
||||
"value": 12.997711,
|
||||
"description": "BooleanClause. Sum of ...",
|
||||
"details": [
|
||||
{
|
||||
"value": 12.997711,
|
||||
"description": "BooleanClause. Sum of ...",
|
||||
"details": [
|
||||
{
|
||||
"value": 6.551476,
|
||||
"description": "TermQuery, product of...",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.2,
|
||||
"description": "(K1+1)"
|
||||
},
|
||||
{
|
||||
"value": 5.658984,
|
||||
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
|
||||
"details": [
|
||||
{
|
||||
"value": 3.0,
|
||||
"description": "n, number of docs containing this term"
|
||||
},
|
||||
{
|
||||
"value": 1003.0,
|
||||
"description": "N, total number of docs"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 0.5262329,
|
||||
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
"details": [
|
||||
{
|
||||
"value": 1.0,
|
||||
"description": "freq, occurrences of term within document"
|
||||
},
|
||||
{
|
||||
"value": 1.2,
|
||||
"description": "k1, term saturation parameter"
|
||||
},
|
||||
{
|
||||
"value": 0.75,
|
||||
"description": "b, length normalization parameter"
|
||||
},
|
||||
{
|
||||
"value": 4.0,
|
||||
"description": "dl, length of field"
|
||||
},
|
||||
{
|
||||
"value": 5.997009,
|
||||
"description": "avgdl, average length of field"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 6.446235,
|
||||
"description": "TermQuery, product of...",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.2,
|
||||
"description": "(K1+1)"
|
||||
},
|
||||
{
|
||||
"value": 5.9954567,
|
||||
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.0,
|
||||
"description": "n, number of docs containing this term"
|
||||
},
|
||||
{
|
||||
"value": 1003.0,
|
||||
"description": "N, total number of docs"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 0.4887212,
|
||||
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
"details": [
|
||||
{
|
||||
"value": 1.0,
|
||||
"description": "freq, occurrences of term within document"
|
||||
},
|
||||
{
|
||||
"value": 1.2,
|
||||
"description": "k1, term saturation parameter"
|
||||
},
|
||||
{
|
||||
"value": 0.75,
|
||||
"description": "b, length normalization parameter"
|
||||
},
|
||||
{
|
||||
"value": 20.0,
|
||||
"description": "dl, length of field"
|
||||
},
|
||||
{
|
||||
"value": 24.123629,
|
||||
"description": "avgdl, average length of field"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::Scorer;
|
||||
use query::Query;
|
||||
use query::explanation::does_not_match;
|
||||
use query::Weight;
|
||||
use query::{Explanation, Query};
|
||||
use DocId;
|
||||
use DocSet;
|
||||
use Result;
|
||||
@@ -32,6 +33,10 @@ impl Weight for EmptyWeight {
|
||||
/// Returns a boxed `EmptyScorer`; the segment reader is not used.
fn scorer(&self, _reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
|
||||
/// Always fails with the `does_not_match` error built for `doc`;
/// the segment reader is not used.
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
Err(does_not_match(doc))
|
||||
}
|
||||
}
|
||||
|
||||
/// `EmptyScorer` is a dummy `Scorer` in which no document matches.
|
||||
|
||||
51
src/query/explanation.rs
Normal file
51
src/query/explanation.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use {DocId, TantivyError};
|
||||
|
||||
pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))
|
||||
}
|
||||
|
||||
/// Object describing the score of a given document.
|
||||
/// It is organized in trees.
|
||||
///
|
||||
/// `.to_pretty_json()` can be useful to print out a human readable
|
||||
/// representation of this tree when debugging a given score.
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct Explanation {
|
||||
value: f32,
|
||||
description: String,
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
details: Vec<Explanation>,
|
||||
}
|
||||
|
||||
impl Explanation {
|
||||
/// Creates a new explanation object.
|
||||
pub fn new<T: ToString>(description: T, value: f32) -> Explanation {
|
||||
Explanation {
|
||||
value,
|
||||
description: description.to_string(),
|
||||
details: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value associated to the current node.
|
||||
pub fn value(&self) -> f32 {
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Add some detail, explaining some part of the current node formula.
|
||||
///
|
||||
/// Details are treated as child of the current node.
|
||||
pub fn add_detail(&mut self, child_explanation: Explanation) {
|
||||
self.details.push(child_explanation);
|
||||
}
|
||||
|
||||
/// Shortcut for `self.details.push(Explanation::new(name, value));`
|
||||
pub fn add_const<T: ToString>(&mut self, name: T, value: f32) {
|
||||
self.details.push(Explanation::new(name, value));
|
||||
}
|
||||
|
||||
/// Returns an indented json representation of the explanation tree for debug usage.
|
||||
pub fn to_pretty_json(&self) -> String {
|
||||
serde_json::to_string_pretty(self).unwrap()
|
||||
}
|
||||
}
|
||||
@@ -14,41 +14,35 @@ use Score;
|
||||
/// specialized implementation if the two
|
||||
/// shortest scorers are `TermScorer`s.
|
||||
pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
|
||||
if scorers.is_empty() {
|
||||
return Box::new(EmptyScorer);
|
||||
}
|
||||
if scorers.len() == 1 {
|
||||
return scorers.pop().unwrap();
|
||||
}
|
||||
// We know that we have at least 2 elements.
|
||||
let num_docsets = scorers.len();
|
||||
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
|
||||
let rarest_opt = scorers.pop();
|
||||
let second_rarest_opt = scorers.pop();
|
||||
let left = scorers.pop().unwrap();
|
||||
let right = scorers.pop().unwrap();
|
||||
scorers.reverse();
|
||||
match (rarest_opt, second_rarest_opt) {
|
||||
(None, None) => Box::new(EmptyScorer),
|
||||
(Some(single_docset), None) => single_docset,
|
||||
(Some(left), Some(right)) => {
|
||||
{
|
||||
let all_term_scorers = [&left, &right]
|
||||
.iter()
|
||||
.all(|&scorer| scorer.is::<TermScorer>());
|
||||
if all_term_scorers {
|
||||
let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
|
||||
let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
|
||||
return Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
})
|
||||
}
|
||||
_ => {
|
||||
unreachable!();
|
||||
}
|
||||
let all_term_scorers = [&left, &right]
|
||||
.iter()
|
||||
.all(|&scorer| scorer.is::<TermScorer>());
|
||||
if all_term_scorers {
|
||||
return Box::new(Intersection {
|
||||
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
|
||||
@@ -124,7 +118,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
@@ -140,35 +133,36 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
}
|
||||
// test the remaining scorers;
|
||||
for (ord, docset) in self.others.iter_mut().enumerate() {
|
||||
if ord != other_candidate_ord {
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
if ord == other_candidate_ord {
|
||||
continue;
|
||||
}
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ mod bm25;
|
||||
mod boolean_query;
|
||||
mod empty_query;
|
||||
mod exclude;
|
||||
mod explanation;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod occur;
|
||||
@@ -39,6 +40,7 @@ pub use self::bitset::BitSetDocSet;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::explanation::Explanation;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::occur::Occur;
|
||||
|
||||
@@ -4,6 +4,7 @@ use error::TantivyError;
|
||||
use query::bm25::BM25Weight;
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, Term};
|
||||
use std::collections::BTreeSet;
|
||||
use Result;
|
||||
@@ -83,7 +84,7 @@ impl Query for PhraseQuery {
|
||||
let has_positions = field_entry
|
||||
.field_type()
|
||||
.get_index_record_option()
|
||||
.map(|index_record_option| index_record_option.has_positions())
|
||||
.map(IndexRecordOption::has_positions)
|
||||
.unwrap_or(false);
|
||||
if !has_positions {
|
||||
let field_name = field_entry.name();
|
||||
@@ -92,21 +93,12 @@ impl Query for PhraseQuery {
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
if scoring_enabled {
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
|
||||
Ok(Box::new(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight,
|
||||
true,
|
||||
)))
|
||||
} else {
|
||||
Ok(Box::new(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
BM25Weight::null(),
|
||||
false,
|
||||
)))
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
|
||||
|
||||
let phrase_weight: PhraseWeight =
|
||||
PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
|
||||
@@ -148,9 +148,13 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the phrase count currently stored on this scorer.
pub fn phrase_count(&self) -> u32 {
|
||||
self.phrase_count
|
||||
}
|
||||
|
||||
fn phrase_match(&mut self) -> bool {
|
||||
if self.score_needed {
|
||||
let count = self.phrase_count();
|
||||
let count = self.compute_phrase_count();
|
||||
self.phrase_count = count;
|
||||
count > 0u32
|
||||
} else {
|
||||
@@ -183,7 +187,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||
}
|
||||
|
||||
fn phrase_count(&mut self) -> u32 {
|
||||
fn compute_phrase_count(&mut self) -> u32 {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
use super::PhraseScorer;
|
||||
use core::SegmentReader;
|
||||
use fieldnorm::FieldNormReader;
|
||||
use postings::SegmentPostings;
|
||||
use query::bm25::BM25Weight;
|
||||
use query::EmptyScorer;
|
||||
use query::explanation::does_not_match;
|
||||
use query::Scorer;
|
||||
use query::Weight;
|
||||
use query::{EmptyScorer, Explanation};
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use Result;
|
||||
use {DocId, DocSet};
|
||||
use {Result, SkipResult};
|
||||
|
||||
pub struct PhraseWeight {
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
@@ -27,13 +31,18 @@ impl PhraseWeight {
|
||||
score_needed,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
|
||||
let field = self.phrase_terms[0].1.field();
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
|
||||
reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader);
|
||||
if reader.has_deletes() {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
@@ -43,10 +52,10 @@ impl Weight for PhraseWeight {
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Box::new(PhraseScorer::new(
|
||||
Ok(Some(PhraseScorer::new(
|
||||
term_postings_list,
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
@@ -61,10 +70,10 @@ impl Weight for PhraseWeight {
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Box::new(PhraseScorer::new(
|
||||
Ok(Some(PhraseScorer::new(
|
||||
term_postings_list,
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
@@ -73,3 +82,30 @@ impl Weight for PhraseWeight {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
if let Some(scorer) = self.phrase_scorer(reader)? {
|
||||
Ok(Box::new(scorer))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let scorer_opt = self.phrase_scorer(reader)?;
|
||||
if scorer_opt.is_none() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut scorer = scorer_opt.unwrap();
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader);
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
let phrase_count = scorer.phrase_count();
|
||||
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
|
||||
explanation.add_detail(self.similarity_weight.explain(fieldnorm_id, phrase_count));
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use super::Weight;
|
||||
use core::searcher::Searcher;
|
||||
use downcast_rs;
|
||||
use query::Explanation;
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt;
|
||||
use Result;
|
||||
use Term;
|
||||
use {downcast_rs, DocAddress};
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
/// for those documents.
|
||||
@@ -48,6 +49,13 @@ pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
|
||||
/// See [`Weight`](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>>;
|
||||
|
||||
/// Returns an `Explanation` for the score of the document.
|
||||
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> Result<Explanation> {
|
||||
let reader = searcher.segment_reader(doc_address.segment_ord());
|
||||
let weight = self.weight(searcher, true)?;
|
||||
weight.explain(reader, doc_address.doc())
|
||||
}
|
||||
|
||||
/// Returns the number of documents matching the query.
|
||||
fn count(&self, searcher: &Searcher) -> Result<usize> {
|
||||
let weight = self.weight(searcher, false)?;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))]
|
||||
|
||||
use super::query_grammar;
|
||||
use super::user_input_ast::*;
|
||||
use combine::char::*;
|
||||
use combine::error::StreamError;
|
||||
@@ -22,7 +23,7 @@ parser! {
|
||||
parser! {
|
||||
fn word[I]()(I) -> String
|
||||
where [I: Stream<Item = char>] {
|
||||
many1(satisfy(|c: char| c.is_alphanumeric()))
|
||||
many1(satisfy(char::is_alphanumeric))
|
||||
.and_then(|s: String| {
|
||||
match s.as_str() {
|
||||
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
||||
@@ -62,7 +63,7 @@ parser! {
|
||||
fn negative_number[I]()(I) -> String
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
(char('-'), many1(satisfy(char::is_numeric)))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
}
|
||||
@@ -184,7 +185,7 @@ parser! {
|
||||
}
|
||||
)
|
||||
)
|
||||
.map(|el| el.into_dnf())
|
||||
.map(query_grammar::Element::into_dnf)
|
||||
.map(|fnd| {
|
||||
if fnd.len() == 1 {
|
||||
UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
|
||||
|
||||
@@ -2,15 +2,17 @@ use common::BitSet;
|
||||
use core::Searcher;
|
||||
use core::SegmentReader;
|
||||
use error::TantivyError;
|
||||
use query::BitSetDocSet;
|
||||
use query::explanation::does_not_match;
|
||||
use query::ConstScorer;
|
||||
use query::{BitSetDocSet, Explanation};
|
||||
use query::{Query, Scorer, Weight};
|
||||
use schema::Type;
|
||||
use schema::{Field, IndexRecordOption, Term};
|
||||
use std::collections::Bound;
|
||||
use std::ops::Range;
|
||||
use termdict::{TermDictionary, TermStreamer};
|
||||
use Result;
|
||||
use DocId;
|
||||
use {Result, SkipResult};
|
||||
|
||||
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
bound: &Bound<TFrom>,
|
||||
@@ -286,6 +288,14 @@ impl Weight for RangeWeight {
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(doc_bitset)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(Explanation::new("RangeQuery", 1.0f32))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -98,4 +98,20 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// Indexes "a b" and "a c", deletes by term "b", and checks that counting
// the term "a" reports a single document.
#[test]
fn test_term_query_count_when_there_are_deletes() {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
    writer.add_document(doc!(text_field=>"a b"));
    writer.add_document(doc!(text_field=>"a c"));
    writer.delete_term(Term::from_field_text(text_field, "b"));
    writer.commit().unwrap();

    let query_on_a = TermQuery::new(
        Term::from_field_text(text_field, "a"),
        IndexRecordOption::Basic,
    );
    let reader = index.reader().unwrap();
    let num_matches = query_on_a.count(&*reader.searcher()).unwrap();
    assert_eq!(num_matches, 1);
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use docset::{DocSet, SkipResult};
|
||||
use query::Scorer;
|
||||
use query::{Explanation, Scorer};
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -28,11 +28,31 @@ impl TermScorer {
|
||||
}
|
||||
}
|
||||
|
||||
impl TermScorer {
|
||||
pub fn term_freq(&self) -> u32 {
|
||||
self.postings.term_freq()
|
||||
}
|
||||
|
||||
pub fn fieldnorm_id(&self) -> u8 {
|
||||
self.fieldnorm_reader.fieldnorm_id(self.doc())
|
||||
}
|
||||
|
||||
pub fn explain(&self) -> Explanation {
|
||||
let fieldnorm_id = self.fieldnorm_id();
|
||||
let term_freq = self.term_freq();
|
||||
self.similarity_weight.explain(fieldnorm_id, term_freq)
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.postings.doc()
|
||||
}
|
||||
@@ -40,17 +60,12 @@ impl DocSet for TermScorer {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.postings.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for TermScorer {
|
||||
fn score(&mut self) -> Score {
|
||||
let doc = self.doc();
|
||||
let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
|
||||
self.similarity_weight
|
||||
.score(fieldnorm_id, self.postings.term_freq())
|
||||
let fieldnorm_id = self.fieldnorm_id();
|
||||
let term_freq = self.term_freq();
|
||||
self.similarity_weight.score(fieldnorm_id, term_freq)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,11 +3,13 @@ use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use query::bm25::BM25Weight;
|
||||
use query::Scorer;
|
||||
use query::explanation::does_not_match;
|
||||
use query::Weight;
|
||||
use query::{Explanation, Scorer};
|
||||
use schema::IndexRecordOption;
|
||||
use Result;
|
||||
use DocId;
|
||||
use Term;
|
||||
use {Result, SkipResult};
|
||||
|
||||
pub struct TermWeight {
|
||||
term: Term,
|
||||
@@ -17,37 +19,28 @@ pub struct TermWeight {
|
||||
|
||||
impl Weight for TermWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option);
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
Ok(Box::new(TermScorer::new(
|
||||
segment_postings,
|
||||
fieldnorm_reader,
|
||||
similarity_weight,
|
||||
)))
|
||||
} else {
|
||||
Ok(Box::new(TermScorer::new(
|
||||
SegmentPostings::empty(),
|
||||
fieldnorm_reader,
|
||||
similarity_weight,
|
||||
)))
|
||||
let term_scorer = self.scorer_specialized(reader)?;
|
||||
Ok(Box::new(term_scorer))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer_specialized(reader)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(scorer.explain())
|
||||
}
|
||||
|
||||
fn count(&self, reader: &SegmentReader) -> Result<u32> {
|
||||
if reader.num_deleted_docs() == 0 {
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
Ok(self.scorer(reader)?.count(delete_bitset))
|
||||
} else {
|
||||
let field = self.term.field();
|
||||
Ok(reader
|
||||
.inverted_index(field)
|
||||
.get_term_info(&self.term)
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0))
|
||||
} else {
|
||||
Ok(self.scorer(reader)?.count())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -64,4 +57,26 @@ impl TermWeight {
|
||||
similarity_weight,
|
||||
}
|
||||
}
|
||||
|
||||
fn scorer_specialized(&self, reader: &SegmentReader) -> Result<TermScorer> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option);
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
Ok(TermScorer::new(
|
||||
segment_postings,
|
||||
fieldnorm_reader,
|
||||
similarity_weight,
|
||||
))
|
||||
} else {
|
||||
Ok(TermScorer::new(
|
||||
SegmentPostings::empty(),
|
||||
fieldnorm_reader,
|
||||
similarity_weight,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
||||
fn refill(&mut self) -> bool {
|
||||
if let Some(min_doc) = self.docsets.iter_mut().map(|docset| docset.doc()).min() {
|
||||
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
|
||||
self.offset = min_doc;
|
||||
self.cursor = 0;
|
||||
refill(
|
||||
@@ -145,7 +145,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
|
||||
.iter()
|
||||
.map(|bitset| bitset.len())
|
||||
@@ -163,6 +163,8 @@ where
|
||||
count
|
||||
}
|
||||
|
||||
// TODO implement `count` efficiently.
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
@@ -300,7 +302,7 @@ mod tests {
|
||||
count += 1;
|
||||
}
|
||||
assert!(!union_expected.advance());
|
||||
assert_eq!(count, make_union().count());
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::Scorer;
|
||||
use core::SegmentReader;
|
||||
use Result;
|
||||
use query::Explanation;
|
||||
use {DocId, Result};
|
||||
|
||||
/// A Weight is the specialization of a Query
|
||||
/// for a given set of segments.
|
||||
@@ -11,8 +12,16 @@ pub trait Weight: Send + Sync + 'static {
|
||||
/// See [`Query`](./trait.Query.html).
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
|
||||
|
||||
/// Returns an `Explanation` for the given document.
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation>;
|
||||
|
||||
/// Returns the number of documents within the given `SegmentReader`.
|
||||
fn count(&self, reader: &SegmentReader) -> Result<u32> {
|
||||
Ok(self.scorer(reader)?.count())
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
Ok(scorer.count(delete_bitset))
|
||||
} else {
|
||||
Ok(scorer.count_including_deleted())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ impl Document {
|
||||
self.field_values
|
||||
.iter()
|
||||
.filter(|field_value| field_value.field() == field)
|
||||
.map(|field_value| field_value.value())
|
||||
.map(FieldValue::value)
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -137,7 +137,7 @@ impl Document {
|
||||
self.field_values
|
||||
.iter()
|
||||
.find(|field_value| field_value.field() == field)
|
||||
.map(|field_value| field_value.value())
|
||||
.map(FieldValue::value)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use schema::{IntOptions, TextOptions};
|
||||
|
||||
use schema::Facet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::TextFieldIndexing;
|
||||
use schema::Value;
|
||||
use serde_json::Value as JsonValue;
|
||||
|
||||
@@ -94,7 +95,7 @@ impl FieldType {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| indexing_options.index_option()),
|
||||
.map(TextFieldIndexing::index_option),
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => {
|
||||
|
||||
@@ -130,7 +130,16 @@ impl SchemaBuilder {
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a fast bytes field to the schema
|
||||
/// Adds a fast bytes field to the schema.
|
||||
///
|
||||
/// Bytes fields are not searchable and are only used
|
||||
/// as fast fields, to associate any kind of payload
|
||||
/// to a document.
|
||||
///
|
||||
/// For instance, learning-to-rank often requires to access
|
||||
/// some document features at scoring time.
|
||||
/// These can be serialized and stored as a bytes field to
|
||||
/// get access rapidly when scoring each document.
|
||||
pub fn add_bytes_field(&mut self, field_name: &str) -> Field {
|
||||
let field_entry = FieldEntry::new_bytes(field_name.to_string());
|
||||
self.add_field(field_entry)
|
||||
@@ -224,7 +233,7 @@ impl Schema {
|
||||
let field_name = self.get_field_name(field);
|
||||
let values: Vec<Value> = field_values
|
||||
.into_iter()
|
||||
.map(|field_val| field_val.value())
|
||||
.map(FieldValue::value)
|
||||
.cloned()
|
||||
.collect();
|
||||
field_map.insert(field_name.to_string(), values);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use htmlescape::encode_minimal;
|
||||
use query::Query;
|
||||
use schema::Field;
|
||||
use schema::Value;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
@@ -303,7 +304,7 @@ impl SnippetGenerator {
|
||||
let text: String = doc
|
||||
.get_all(self.field)
|
||||
.into_iter()
|
||||
.flat_map(|val| val.text())
|
||||
.flat_map(Value::text)
|
||||
.collect::<Vec<&str>>()
|
||||
.join(" ");
|
||||
self.snippet(&text)
|
||||
|
||||
@@ -227,7 +227,7 @@ pub struct PerFieldSpaceUsage {
|
||||
|
||||
impl PerFieldSpaceUsage {
|
||||
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
|
||||
let total = fields.values().map(|x| x.total()).sum();
|
||||
let total = fields.values().map(FieldUsage::total).sum();
|
||||
PerFieldSpaceUsage { fields, total }
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ const BLOCK_SIZE: usize = 16_384;
|
||||
/// the store is written to disc as documents are being added,
|
||||
/// as opposed to when the segment is getting finalized.
|
||||
///
|
||||
/// The skip list index on the other hand, is build in memory.
|
||||
/// The skip list index on the other hand, is built in memory.
|
||||
///
|
||||
pub struct StoreWriter {
|
||||
doc: DocId,
|
||||
|
||||
4064
src/tokenizer/ascii_folding_filter.rs
Normal file
4064
src/tokenizer/ascii_folding_filter.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -44,18 +44,17 @@ where
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.tail.advance() {
|
||||
if self.token_mut().text.is_ascii() {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
true
|
||||
} else {
|
||||
false
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
}
|
||||
if self.token_mut().text.is_ascii() {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -97,6 +97,8 @@
|
||||
//! If you built your schema programmatically, a complete example
|
||||
//! could look like this for instance.
|
||||
//!
|
||||
//! Note that tokens with a len greater or equal to [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html) will simply be ignored downstream.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
@@ -129,6 +131,7 @@
|
||||
//! ```
|
||||
//!
|
||||
mod alphanum_only;
|
||||
mod ascii_folding_filter;
|
||||
mod facet_tokenizer;
|
||||
mod lower_caser;
|
||||
mod ngram_tokenizer;
|
||||
@@ -142,6 +145,7 @@ mod tokenizer;
|
||||
mod tokenizer_manager;
|
||||
|
||||
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||
pub use self::ascii_folding_filter::AsciiFoldingFilter;
|
||||
pub use self::facet_tokenizer::FacetTokenizer;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
@@ -157,6 +161,13 @@ pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
/// Maximum authorized len (in bytes) for a token.
|
||||
///
|
||||
/// Tokenizers are in charge of not emitting tokens larger than this value.
|
||||
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
|
||||
/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
|
||||
pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::{
|
||||
@@ -228,27 +239,27 @@ pub mod tests {
|
||||
fn test_non_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"es_stem",
|
||||
"el_stem",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::Spanish)),
|
||||
.filter(Stemmer::new(Language::Greek)),
|
||||
);
|
||||
let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
|
||||
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer
|
||||
.token_stream("Hola, feliz contribuyente!")
|
||||
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
assert_eq!(tokens.len(), 3);
|
||||
assert_token(&tokens[0], 0, "hola", 0, 4);
|
||||
assert_token(&tokens[1], 1, "feliz", 6, 11);
|
||||
assert_token(&tokens[2], 2, "contribuyent", 12, 25);
|
||||
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
|
||||
assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
|
||||
assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -29,12 +29,9 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
|
||||
|
||||
impl TokenStream for RawTokenStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.has_token {
|
||||
self.has_token = false;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
let result = self.has_token;
|
||||
self.has_token = false;
|
||||
result
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
|
||||
@@ -91,7 +91,6 @@ where
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,23 +38,16 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.token.text.clear();
|
||||
self.token.position = self.token.position.wrapping_add(1);
|
||||
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some((offset_from, c)) => {
|
||||
if c.is_alphanumeric() {
|
||||
let offset_to = self.search_token_end();
|
||||
self.token.offset_from = offset_from;
|
||||
self.token.offset_to = offset_to;
|
||||
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return false;
|
||||
}
|
||||
while let Some((offset_from, c)) = self.chars.next() {
|
||||
if c.is_alphanumeric() {
|
||||
let offset_to = self.search_token_end();
|
||||
self.token.offset_from = offset_from;
|
||||
self.token.offset_to = offset_to;
|
||||
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Available stemmer languages.
|
||||
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
||||
@@ -57,14 +56,14 @@ impl Language {
|
||||
/// Tokens are expected to be lowercased beforehand.
|
||||
#[derive(Clone)]
|
||||
pub struct Stemmer {
|
||||
stemmer_algorithm: Arc<Algorithm>,
|
||||
stemmer_algorithm: Algorithm,
|
||||
}
|
||||
|
||||
impl Stemmer {
|
||||
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
|
||||
pub fn new(language: Language) -> Stemmer {
|
||||
Stemmer {
|
||||
stemmer_algorithm: Arc::new(language.algorithm()),
|
||||
stemmer_algorithm: language.algorithm(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,7 +82,7 @@ where
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
StemmerTokenStream::wrap(inner_stemmer, token_stream)
|
||||
}
|
||||
}
|
||||
@@ -109,15 +108,14 @@ where
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.tail.advance() {
|
||||
// TODO remove allocation
|
||||
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
|
||||
self.token_mut().text.clear();
|
||||
self.token_mut().text.push_str(&stemmed_str);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
}
|
||||
// TODO remove allocation
|
||||
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
|
||||
self.token_mut().text.clear();
|
||||
self.token_mut().text.push_str(&stemmed_str);
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -104,7 +104,6 @@ where
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Deref;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tokenizer::box_tokenizer;
|
||||
use tokenizer::stemmer::Language;
|
||||
@@ -46,7 +47,8 @@ impl TokenizerManager {
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.get(tokenizer_name)
|
||||
.map(|boxed_tokenizer| boxed_tokenizer.boxed_clone())
|
||||
.map(Deref::deref)
|
||||
.map(BoxedTokenizer::boxed_clone)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user