Compare commits

...

21 Commits

Author SHA1 Message Date
Paul Masurel
8a3e520bfe Bumping bitpacking version 2019-06-26 09:54:11 +09:00
Antoine Catton
bcd7386fc5 Add crates.io shield to the README (#572) 2019-06-18 11:19:06 +09:00
Paul Masurel
c23a7c992b Closes #552 (#570)
The different handles to `SegmentMeta` are closed before calling gc on
end_merge.
2019-06-16 14:12:13 +09:00
Paul Masurel
2a88094ec4 Disabling travis on OSX (#571) 2019-06-16 14:12:01 +09:00
Paul Masurel
ca3cfddab4 adding cond (#568) 2019-06-16 11:59:26 +09:00
Paul Masurel
7bd9f9773b trying to fix doc upload (#567) 2019-06-16 11:22:51 +09:00
Paul Masurel
e2da92fcb5 Petr tik n510 clear index (#566)
* Enables clearing the index

Closes #510

* Adds an example to clear and rebuild the index

* Addressing code review

Moved the example from examples/ to docstring above `clear`

* Corrected minor typos and missed/duplicate words

* Added stamper.revert method to be used for rollback

Added type alias for Opstamp

Moved to AtomicU64 on stable rust (since 1.34)

* Change the method name and doc-string

* Remove rollback from delete_all_documents

test_add_then_delete_all_documents fails with --test-threads 2

* Passes all the tests with any number of test-threads

(ran locally 5 times)

* Addressed code review

Deleted comments with debug info
changed ReloadPolicy to Manual

* Removing useless garbage_collect call and updated CHANGELOG
2019-06-12 09:40:03 +09:00
petr-tik
876e1451c4 Resume uploading docs to gh-pages (#565)
* Fixes #546

Generate docs and upload them. Need GH_TOKEN env var to be set in travis settings

* Investigate what TRAVIS* env vars are set
2019-06-12 09:30:09 +09:00
dependabot-preview[bot]
a37d2f9777 Update winapi requirement from 0.2 to 0.3 (#537)
* Update winapi requirement from 0.2 to 0.3

Updates the requirements on [winapi](https://github.com/retep998/winapi-rs) to permit the latest version.
- [Release notes](https://github.com/retep998/winapi-rs/releases)
- [Commits](https://github.com/retep998/winapi-rs/commits/0.3.7)

Signed-off-by: dependabot[bot] <support@dependabot.com>

* Fixing upgrading winapi (hopefully).
2019-06-06 10:23:13 +09:00
Paul Masurel
4822940b19 Issue/36 (#559)
* Added explanation

* Explain

* Splitting weight and idf

* Added comments

Closes #36
2019-06-06 10:03:54 +09:00
Paul Masurel
d590f4c6b0 Comments for IndexMeta (#560) 2019-06-06 09:24:31 +09:00
Paul Masurel
edfa619519 Update .travis.yml 2019-05-29 16:45:56 +09:00
Paul Masurel
96f194635f Trying to address #546 2019-05-29 09:17:41 +09:00
Paul Masurel
444662485f Remove mut in add_document and delete_term. Made stamper ordering rel… (#551)
* Remove mut in add_document and delete_term. Made stamper ordering relaxed.

* Made batch operations &mut self -> &self

* Added example
2019-05-28 10:26:00 +09:00
Stephen Carman
943c25d0f8 Make IndexMeta public (#553) 2019-05-28 09:27:49 +09:00
Paul Masurel
5c0b2a4579 Merge branch 'stamper_refactor' 2019-05-08 10:02:02 +09:00
Paul Masurel
9870a9258d Removed the mutex implementation of AtomicU64.
Fixed comment
2019-05-08 09:59:28 +09:00
petr-tik
da46913839 Merge branch 'master' into stamper_refactor 2019-04-30 22:28:48 +01:00
petr-tik
8ffae47854 Addressed code review
moved Opstamp to top-level namespace, added a docstring

Corrected minor typos/whitespace
2019-04-29 21:23:28 +01:00
petr-tik
1a90a1f3b0 Merge branch 'master' of github.com:tantivy-search/tantivy into stamper_refactor 2019-04-26 08:47:12 +01:00
petr-tik
8e50921363 Tidied up the Stamper module and upgraded to a 1.34 dependency
Added stamper.revert method to be used for rollback - rolling back to a previous
commit (when deleting all documents or rolling operations back) should reset
the stamper as well

Added type alias for Opstamp - helps code readability instead of seeing a bare u64
returned by functions.

Moved to AtomicU64 on stable rust (since 1.34) - where possible use standard
library interfaces.
2019-04-24 20:46:28 +01:00
42 changed files with 1048 additions and 235 deletions

View File

@@ -10,7 +10,7 @@ env:
global:
- CRATE_NAME=tantivy
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
@@ -38,12 +38,12 @@ matrix:
# Linux
#- env: TARGET=aarch64-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 UPLOAD_DOCS=1
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
# OSX
- env: TARGET=x86_64-apple-darwin
os: osx
#- env: TARGET=x86_64-apple-darwin
# os: osx
before_install:
- set -e
@@ -52,6 +52,7 @@ before_install:
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- env | grep "TRAVIS"
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
@@ -64,6 +65,11 @@ script:
before_deploy:
- sh ci/before_deploy.sh
after_success:
# Needs GH_TOKEN env var to be set in travis settings
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"

View File

@@ -5,18 +5,27 @@ Tantivy 0.10.0
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
- Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
All segments are simply removed.
Minor
---------
- Small simplification of the code.
Calling .freq() or .doc() when .advance() has never
Calling .freq() or .doc() when .advance() has never been called
on segment postings should panic from now on.
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
- Fast fields are now preloaded when the `SegmentReader` is created.
- `IndexMeta` is now public. (@hntd187)
- `IndexWriter`'s `add_document` and `delete_term` now take `&self`. `IndexWriter` is `Sync`,
making it possible to use it with an `Arc<RwLock<IndexWriter>>`. `add_document` and
`delete_term` only require a read lock. (@pmasurel)
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
## How to update?
Your existing indexes are usable as is. Your may or may need some
Your existing indexes are usable as is, but you may need some
trivial updates.
### Fast fields

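A condensed sketch of the clear-and-rebuild flow described in the changelog above. This is illustrative only: the schema, document contents, and buffer size are made up; the calls themselves (`delete_all_documents`, `commit`) are the ones introduced in this release.

```rust
#[macro_use]
extern crate tantivy;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;

    writer.add_document(doc!(title => "old document"));
    writer.commit()?;

    // Clear the index: all segments are simply removed.
    // Like adds and deletes, this only takes effect on the next commit.
    writer.delete_all_documents()?;
    writer.commit()?;

    // Rebuild by resubmitting whatever should remain.
    writer.add_document(doc!(title => "fresh document"));
    writer.commit()?;
    Ok(())
}
```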
View File

@@ -42,7 +42,7 @@ owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
bitpacking = "0.6"
bitpacking = "0.7"
census = "0.2"
fnv = "1.0.6"
owned-read = "0.4"
@@ -54,7 +54,7 @@ murmurhash32 = "0.2"
chrono = "0.4"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
winapi = "0.3"
[dev-dependencies]
rand = "0.6"

View File

@@ -4,6 +4,7 @@
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)

View File

@@ -0,0 +1,107 @@
// # Indexing from different threads.
//
// It is fairly common to have to index from different threads.
// Tantivy forbids creating more than one `IndexWriter` at a time.
//
// The `IndexWriter` itself has its own multithreaded layer, so managing your own
// indexing threads will not help. However, it can still be useful for some applications.
//
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
// your application, it is reasonable to have multiple threads.
//
// Another very common reason to want to index from multiple threads is implementing a webserver
// with CRUD capabilities. The server framework will most likely handle requests from
// different threads.
//
// The recommended way to address both of these use cases is to wrap your `IndexWriter` in an
// `Arc<RwLock<IndexWriter>>`.
//
// While this is counterintuitive, adding and deleting documents do not require mutability
// over the `IndexWriter`, so several threads can perform these operations concurrently.
//
// The example below does not represent an actual real-life use case (who would spawn a thread to
// index a single document?), but aims at demonstrating the mechanism that makes indexing
// from several threads possible.
extern crate tempdir;
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Opstamp;
use tantivy::{Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
// # First indexing thread.
let index_writer_clone_1 = index_writer.clone();
thread::spawn(move || {
// we index the same document 100 times... for the sake of the example.
for i in 0..100 {
let opstamp = {
// A read lock is sufficient here.
let index_writer_rlock = index_writer_clone_1.read().unwrap();
index_writer_rlock.add_document(
doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))
};
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20));
}
});
// # Second indexing thread.
let index_writer_clone_2 = index_writer.clone();
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
thread::spawn(move || {
// we index the same document 100 times... for the sake of the example.
for i in 0..100 {
// A read lock is sufficient here.
let opstamp = {
let index_writer_rlock = index_writer_clone_2.read().unwrap();
index_writer_rlock.add_document(doc!(
title => "Manufacturing consent",
body => "Some great book description..."
))
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10));
}
});
// # In the main thread, we commit 10 times, once every 500ms.
for _ in 0..10 {
let opstamp: Opstamp = {
// Committing or rolling back, on the other hand, requires a write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit().unwrap()
};
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));
}
Ok(())
}

View File

@@ -537,4 +537,35 @@ mod tests {
}
assert_eq!(count, 2);
}
#[test]
fn garbage_collect_works_as_intended() {
let directory = RAMDirectory::create();
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema).unwrap();
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i));
}
writer.commit().unwrap();
let mem_right_after_commit = directory.total_mem_usage();
thread::sleep(Duration::from_millis(1_000));
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads().unwrap();
let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 8_000);
assert!(mem_right_after_merge_finished < mem_right_after_commit);
}
}

View File

@@ -2,6 +2,7 @@ use core::SegmentMeta;
use schema::Schema;
use serde_json;
use std::fmt;
use Opstamp;
/// Meta information about the `Index`.
///
@@ -13,14 +14,27 @@ use std::fmt;
///
#[derive(Clone, Serialize, Deserialize)]
pub struct IndexMeta {
/// List of `SegmentMeta` informations associated to each finalized segment of the index.
pub segments: Vec<SegmentMeta>,
/// Index `Schema`
pub schema: Schema,
pub opstamp: u64,
/// Opstamp associated to the last `commit` operation.
pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
pub payload: Option<String>,
}
impl IndexMeta {
/// Create an `IndexMeta` object representing a brand new `Index`
/// with the given schema.
///
/// This new index does not contain any segments.
/// Its opstamp will be `0u64`.
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
segments: vec![],

View File

@@ -10,6 +10,7 @@ use schema::Schema;
use std::fmt;
use std::path::PathBuf;
use std::result;
use Opstamp;
use Result;
/// A segment is a piece of the index.
@@ -50,7 +51,7 @@ impl Segment {
}
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
Segment {
index: self.index,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),

View File

@@ -5,6 +5,7 @@ use serde;
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
use Opstamp;
lazy_static! {
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
@@ -13,7 +14,7 @@ lazy_static! {
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
num_deleted_docs: u32,
opstamp: u64,
opstamp: Opstamp,
}
/// `SegmentMeta` contains simple meta information about a segment.
@@ -136,9 +137,9 @@ impl SegmentMeta {
self.max_doc() - self.num_deleted_docs()
}
/// Returns the opstamp of the last delete operation
/// Returns the `Opstamp` of the last delete operation
/// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> {
pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.tracked
.deletes
.as_ref()
@@ -152,7 +153,7 @@ impl SegmentMeta {
}
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
let delete_meta = DeleteMeta {
num_deleted_docs,
opstamp,

View File

@@ -246,7 +246,7 @@ impl SegmentReader {
let termdict_source = self
.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
.expect("Failed to open field term dictionary in composite file. Is the field indexed?");
let positions_source = self
.positions_composite

View File

@@ -48,14 +48,14 @@ impl RetryPolicy {
///
/// It is transparently associated with a lock file that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
pub struct DirectoryLock(Box<Drop + Send + 'static>);
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
struct DirectoryLockGuard {
directory: Box<Directory>,
path: PathBuf,
}
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
fn from(underlying: Box<T>) -> Self {
DirectoryLock(underlying)
}

View File

@@ -6,7 +6,7 @@ use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
pub enum LockError {
/// Failed to acquired a lock as it is already hold by another
/// Failed to acquire a lock as it is already held by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.

View File

@@ -320,7 +320,7 @@ impl MmapDirectory {
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
use winapi::um::winbase;
open_opts
.write(true)

View File

@@ -103,6 +103,10 @@ impl InnerDirectory {
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
self.watch_router.subscribe(watch_handle)
}
fn total_mem_usage(&self) -> usize {
self.fs.values().map(|f| f.len()).sum()
}
}
impl fmt::Debug for RAMDirectory {
@@ -126,6 +130,12 @@ impl RAMDirectory {
pub fn create() -> RAMDirectory {
Self::default()
}
/// Returns the sum of the size of the different files
/// in the RAMDirectory.
pub fn total_mem_usage(&self) -> usize {
self.fs.read().unwrap().total_mem_usage()
}
}
impl Directory for RAMDirectory {

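A quick sketch of the new accessor, following the same pattern as the `garbage_collect_works_as_intended` test above; document contents and buffer size are illustrative:

```rust
#[macro_use]
extern crate tantivy;
use tantivy::directory::RAMDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let directory = RAMDirectory::create();
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create(directory.clone(), schema_builder.build())?;
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text => "hello"));
    writer.commit()?;
    // Sum of the sizes of all the files currently in the RAMDirectory.
    println!("{} bytes held in RAM", directory.total_mem_usage());
    Ok(())
}
```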
View File

@@ -2,6 +2,7 @@ use super::operation::DeleteOperation;
use std::mem;
use std::ops::DerefMut;
use std::sync::{Arc, RwLock};
use Opstamp;
// The DeleteQueue is conceptually similar to a multiple-consumer,
// single-producer broadcast channel.
@@ -184,7 +185,7 @@ impl DeleteCursor {
/// queue are consumed and the next get will return None.
/// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: u64) {
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
// TODO: Could be optimized, as we work with blocks.
while self.is_behind_opstamp(target_opstamp) {
self.advance();
@@ -192,7 +193,7 @@ impl DeleteCursor {
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)
.unwrap_or(false)

View File

@@ -1,5 +1,6 @@
use std::sync::Arc;
use DocId;
use Opstamp;
// Doc to opstamp is used to identify which
// document should be deleted.
@@ -23,7 +24,7 @@ pub enum DocToOpstampMapping {
}
impl From<Vec<u64>> for DocToOpstampMapping {
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(Arc::new(opstamps))
}
}
@@ -35,7 +36,7 @@ impl DocToOpstampMapping {
//
// The edge case opstamp = some doc opstamp is in practice
// never called.
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {

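In a nutshell: documents are stamped in order, so the per-document opstamps are sorted, and a binary search for the delete's `target_opstamp` yields how many documents were added strictly before it. A standalone sketch of that logic, not tantivy's actual code:

```rust
// Standalone sketch: given per-doc opstamps (monotonic in doc id),
// count the docs stamped strictly before `target_opstamp`.
fn compute_doc_limit(doc_opstamps: &[u64], target_opstamp: u64) -> u32 {
    match doc_opstamps.binary_search(&target_opstamp) {
        // As the comment above notes, the `Ok` case (a doc sharing
        // the delete's opstamp) never happens in practice.
        Ok(doc) | Err(doc) => doc as u32,
    }
}

fn main() {
    let opstamps = [2u64, 3, 5, 8];
    // A delete stamped 6 can only affect docs 0..3 (opstamps 2, 3, 5).
    assert_eq!(compute_doc_limit(&opstamps, 6), 3);
}
```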
View File

@@ -30,6 +30,7 @@ use std::ops::Range;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use Opstamp;
use Result;
// Size of the margin for the heap. A segment is closed when the remaining memory
@@ -99,7 +100,7 @@ pub struct IndexWriter {
delete_queue: DeleteQueue,
stamper: Stamper,
committed_opstamp: u64,
committed_opstamp: Opstamp,
}
/// Open a new index writer. Attempts to acquire a lockfile.
@@ -177,7 +178,7 @@ pub fn compute_deleted_bitset(
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64,
target_opstamp: Opstamp,
) -> Result<bool> {
let mut might_have_changed = false;
@@ -219,7 +220,7 @@ pub fn compute_deleted_bitset(
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64,
target_opstamp: Opstamp,
) -> Result<()> {
{
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
@@ -299,11 +300,11 @@ fn index_documents(
// the worker thread.
assert!(num_docs > 0);
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
let segment_meta = SegmentMeta::new(segment_id, num_docs);
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
let delete_bitset_opt = if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
@@ -331,7 +332,8 @@ fn index_documents(
}
impl IndexWriter {
/// The index writer
/// If there are some merging threads, blocks until they all finish their work and
/// then drops the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
@@ -382,7 +384,6 @@ impl IndexWriter {
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.operation_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
@@ -461,6 +462,52 @@ impl IndexWriter {
self.segment_updater.garbage_collect_files()
}
/// Deletes all documents from the index.
///
/// Requires a `commit` to take effect.
/// Enables users to rebuild the index
/// by clearing and resubmitting the necessary documents.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::query::QueryParser;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::*;
/// use tantivy::Index;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT | STORED);
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema.clone());
///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Prometheus"));
/// index_writer.commit()?;
///
/// let clear_res = index_writer.delete_all_documents().unwrap();
/// // have to commit, otherwise deleted terms remain available
/// index_writer.commit()?;
///
/// let searcher = index.reader()?.searcher();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query_promo = query_parser.parse_query("Prometheus")?;
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
///
/// assert!(top_docs_promo.is_empty());
/// Ok(())
/// }
/// ```
pub fn delete_all_documents(&mut self) -> Result<Opstamp> {
// Delete segments
self.segment_updater.remove_all_segments();
// Return the new (reverted) stamp.
self.stamper.revert(self.committed_opstamp);
Ok(self.committed_opstamp)
}
/// Merges a given list of segments
///
/// `segment_ids` is required to be non-empty.
@@ -488,19 +535,22 @@ impl IndexWriter {
/// Rollback to the last commit
///
/// This cancels all of the update that
/// happened before after the last commit.
/// This cancels all of the updates that
/// happened after the last commit.
/// After calling rollback, the index is in the same
/// state as it was after the last commit.
///
/// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> Result<()> {
pub fn rollback(&mut self) -> Result<Opstamp> {
info!("Rolling back to opstamp {}", self.committed_opstamp);
self.rollback_impl()
}
/// Private, implementation of rollback
fn rollback_impl(&mut self) -> Result<Opstamp> {
// marks the segment updater as killed. From now on, all
// segment updates will be ignored.
self.segment_updater.kill();
let document_receiver = self.operation_receiver.clone();
// take the directory lock to create a new index_writer.
@@ -529,7 +579,7 @@ impl IndexWriter {
// was dropped with the index_writer.
for _ in document_receiver.clone() {}
Ok(())
Ok(self.committed_opstamp)
}
/// Prepares a commit.
@@ -567,7 +617,7 @@ impl IndexWriter {
info!("Preparing commit");
// this will drop the current document channel
// and recreate a new one channels.
// and create a new one.
self.recreate_document_channel();
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
@@ -601,7 +651,7 @@ impl IndexWriter {
/// Commit returns the `opstamp` of the last document
/// that made it in the commit.
///
pub fn commit(&mut self) -> Result<u64> {
pub fn commit(&mut self) -> Result<Opstamp> {
self.prepare_commit()?.commit()
}
@@ -617,7 +667,7 @@ impl IndexWriter {
///
/// Like adds, the deletion itself will be visible
/// only after calling `commit()`.
pub fn delete_term(&mut self, term: Term) -> u64 {
pub fn delete_term(&self, term: Term) -> Opstamp {
let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation { opstamp, term };
self.delete_queue.push(delete_operation);
@@ -631,7 +681,7 @@ impl IndexWriter {
///
/// This is also the opstamp of the commit that is currently
/// available for searchers.
pub fn commit_opstamp(&self) -> u64 {
pub fn commit_opstamp(&self) -> Opstamp {
self.committed_opstamp
}
@@ -645,7 +695,7 @@ impl IndexWriter {
///
/// Currently it represents the number of documents that
/// have been added since the creation of the index.
pub fn add_document(&mut self, document: Document) -> u64 {
pub fn add_document(&self, document: Document) -> Opstamp {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
let send_result = self.operation_sender.send(vec![add_operation]);
@@ -662,7 +712,7 @@ impl IndexWriter {
/// The total number of stamps generated by this method is `count + 1`;
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
/// is for the batch itself.
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1;
let stamps = Range {
@@ -688,7 +738,7 @@ impl IndexWriter {
/// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`.
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
let count = user_operations.len() as u64;
if count == 0 {
return self.stamper.stamp();
@@ -739,7 +789,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
@@ -801,7 +851,7 @@ mod tests {
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer(3_000_000).unwrap();
let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64);
@@ -1048,4 +1098,145 @@ mod tests {
assert_eq!(num_docs_containing("b"), 0);
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
}
#[test]
fn test_add_then_delete_all_documents() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
reader.reload().unwrap();
let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
};
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let commit_tstamp = index_writer.commit().unwrap();
assert!(commit_tstamp > add_tstamp);
index_writer.delete_all_documents().unwrap();
index_writer.commit().unwrap();
// Search for documents with the same term that we added
assert_eq!(num_docs_containing("a"), 0);
}
#[test]
fn test_delete_all_documents_rollback_correct_stamp() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
// commit documents - they are now available
let first_commit = index_writer.commit();
assert!(first_commit.is_ok());
let first_commit_tstamp = first_commit.unwrap();
assert!(first_commit_tstamp > add_tstamp);
// delete_all_documents the index
let clear_tstamp = index_writer.delete_all_documents().unwrap();
assert_eq!(clear_tstamp, add_tstamp);
// commit the clear command - now documents aren't available
let second_commit = index_writer.commit();
assert!(second_commit.is_ok());
let second_commit_tstamp = second_commit.unwrap();
// add new documents again
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
}
// rollback to last commit, when index was empty
let rollback = index_writer.rollback();
assert!(rollback.is_ok());
let rollback_tstamp = rollback.unwrap();
assert_eq!(rollback_tstamp, second_commit_tstamp);
// working with an empty index == no documents
let term_b = Term::from_field_text(text_field, "b");
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0);
}
#[test]
fn test_delete_all_documents_then_add() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let res = index_writer.delete_all_documents();
assert!(res.is_ok());
assert!(index_writer.commit().is_ok());
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
assert!(index_writer.commit().is_ok());
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
}
#[test]
fn test_delete_all_documents_and_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
let comm = index_writer.commit();
assert!(comm.is_ok());
let commit_tstamp = comm.unwrap();
// clear but don't commit!
let clear_tstamp = index_writer.delete_all_documents().unwrap();
// clear_tstamp should reset to before the last commit
assert!(clear_tstamp < commit_tstamp);
// rollback
let _rollback_tstamp = index_writer.rollback().unwrap();
// Find original docs in the index
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
}
#[test]
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
assert!(commit.is_ok());
}
#[test]
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
assert!(commit.is_ok());
let clear_again = index_writer.delete_all_documents();
let commit_again = index_writer.commit();
assert!(clear_again.is_ok());
assert!(commit_again.is_ok());
}
}

View File

@@ -1,5 +1,6 @@
use census::{Inventory, TrackedObject};
use std::collections::HashSet;
use Opstamp;
use SegmentId;
#[derive(Default)]
@@ -17,8 +18,8 @@ impl MergeOperationInventory {
}
}
/// A `MergeOperation` has two role.
/// It carries all of the information required to describe a merge :
/// A `MergeOperation` has two roles.
/// It carries all of the information required to describe a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `segment_ids` is the list of segments to be merged.
@@ -35,14 +36,14 @@ pub struct MergeOperation {
}
struct InnerMergeOperation {
target_opstamp: u64,
target_opstamp: Opstamp,
segment_ids: Vec<SegmentId>,
}
impl MergeOperation {
pub fn new(
inventory: &MergeOperationInventory,
target_opstamp: u64,
target_opstamp: Opstamp,
segment_ids: Vec<SegmentId>,
) -> MergeOperation {
let inner_merge_operation = InnerMergeOperation {
@@ -54,7 +55,7 @@ impl MergeOperation {
}
}
pub fn target_opstamp(&self) -> u64 {
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}

View File

@@ -1,17 +1,18 @@
use schema::Document;
use schema::Term;
use Opstamp;
/// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation {
pub opstamp: u64,
pub opstamp: Opstamp,
pub term: Term,
}
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub opstamp: u64,
pub opstamp: Opstamp,
pub document: Document,
}

View File

@@ -1,15 +1,16 @@
use super::IndexWriter;
use Opstamp;
use Result;
/// A prepared commit
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: u64,
opstamp: Opstamp,
}
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
PreparedCommit {
index_writer,
payload: None,
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
}
}
pub fn opstamp(&self) -> u64 {
pub fn opstamp(&self) -> Opstamp {
self.opstamp
}
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
self.payload = Some(payload.to_string())
}
pub fn abort(self) -> Result<()> {
pub fn abort(self) -> Result<Opstamp> {
self.index_writer.rollback()
}
pub fn commit(self) -> Result<u64> {
pub fn commit(self) -> Result<Opstamp> {
info!("committing {}", self.opstamp);
self.index_writer
.segment_updater()

View File

@@ -118,6 +118,12 @@ impl SegmentManager {
});
}
pub(crate) fn remove_all_segments(&self) {
let mut registers_lock = self.write();
registers_lock.committed.clear();
registers_lock.uncommitted.clear();
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.committed.clear();

View File

@@ -36,14 +36,15 @@ use std::sync::Arc;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use Opstamp;
use Result;
/// Save the index meta file.
/// This operation is atomic :
/// Either
// - it fails, in which case an error is returned,
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
@@ -69,6 +70,7 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
@@ -212,6 +214,11 @@ impl SegmentUpdater {
}
}
/// Orders `SegmentManager` to remove all segments
pub(crate) fn remove_all_segments(&self) {
self.0.segment_manager.remove_all_segments();
}
pub fn kill(&mut self) {
self.0.killed.store(true, Ordering::Release);
}
@@ -222,9 +229,9 @@ impl SegmentUpdater {
/// Apply deletes up to the target opstamp to all segments.
///
/// Tne method returns copies of the segment entries,
/// The method returns copies of the segment entries,
/// updated with the delete information.
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries();
for segment_entry in &mut segment_entries {
let segment = self.0.index.segment(segment_entry.meta().clone());
@@ -233,7 +240,7 @@ impl SegmentUpdater {
Ok(segment_entries)
}
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
@@ -280,7 +287,7 @@ impl SegmentUpdater {
.garbage_collect(|| self.0.segment_manager.list_files());
}
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
self.run_async(move |segment_updater| {
if segment_updater.is_alive() {
let segment_entries = segment_updater
@@ -445,38 +452,41 @@ impl SegmentUpdater {
) -> Result<()> {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(e) =
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
{
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
);
if cfg!(test) {
panic!("Merge failed.");
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(e) = advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
);
if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
return;
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
return;
}
}
}
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
let previous_metas = segment_updater.load_metas();
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
segment_updater.consider_merge_options();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
} // we drop all possible handle to a now useless `SegmentMeta`.
segment_updater.garbage_collect_files_exec();
})
.wait()
@@ -650,4 +660,31 @@ mod tests {
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(reader.searcher().segment_readers().is_empty());
}
#[test]
fn test_remove_all_segments() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
index_writer.segment_updater().remove_all_segments();
let seg_vec = index_writer
.segment_updater()
.0
.segment_manager
.segment_entries();
assert!(seg_vec.is_empty());
}
}

View File

@@ -16,10 +16,11 @@ use tokenizer::BoxedTokenizer;
use tokenizer::FacetTokenizer;
use tokenizer::{TokenStream, Tokenizer};
use DocId;
use Opstamp;
use Result;
/// A `SegmentWriter` is in charge of creating segment index from a
/// documents.
/// set of documents.
///
/// It creates the postings lists in anonymous memory.
/// The segment is laid out on disk when it gets `finalized`.
@@ -29,7 +30,7 @@ pub struct SegmentWriter {
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<u64>,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}

View File

@@ -1,76 +1,39 @@
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use Opstamp;
// AtomicU64 have not landed in stable.
// For the moment let's just use AtomicUsize on
// x86/64 bit platform, and a mutex on other platform.
#[cfg(target_arch = "x86_64")]
mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering};
#[derive(Default)]
pub struct AtomicU64Ersatz(AtomicUsize);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val as usize, order) as u64
}
}
}
#[cfg(not(target_arch = "x86_64"))]
mod archicture_impl {
use std::sync::atomic::Ordering;
/// Under other architecture, we rely on a mutex.
use std::sync::RwLock;
#[derive(Default)]
pub struct AtomicU64Ersatz(RwLock<u64>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(RwLock::new(first_opstamp))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
previous_val
}
}
}
use self::archicture_impl::AtomicU64Ersatz;
/// Stamper provides Opstamps, which is just an auto-increment id to label
/// an operation.
///
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
pub fn new(first_opstamp: Opstamp) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
pub fn stamp(&self) -> Opstamp {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
/// Given a desired count `n`, `stamps` returns an iterator that
/// will supply `n` number of u64 stamps.
pub fn stamps(&self, n: u64) -> Range<u64> {
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
let start = self.0.fetch_add(n, Ordering::SeqCst);
Range {
start,
end: start + n,
}
}
/// Reverts the stamper to a given `Opstamp` value and returns it
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
self.0.store(to_opstamp, Ordering::SeqCst);
to_opstamp
}
}
#[cfg(test)]
@@ -92,4 +55,18 @@ mod test {
assert_eq!(stamper.stamps(3u64), (12..15));
assert_eq!(stamper.stamp(), 15u64);
}
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);
assert_eq!(stamper.stamp(), 7u64);
assert_eq!(stamper.stamp(), 8u64);
let stamper_clone = stamper.clone();
assert_eq!(stamper_clone.stamp(), 9u64);
stamper.revert(6);
assert_eq!(stamper.stamp(), 6);
assert_eq!(stamper_clone.stamp(), 7);
}
}

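The whole mechanism now boils down to std's `AtomicU64`, stable since Rust 1.34. A self-contained re-creation of the pattern, not tantivy code: clones share one `Arc`, so they never hand out overlapping stamps, and a revert is just a `store`:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

fn main() {
    // A shared, monotonically increasing counter, as in `Stamper`.
    let stamper = Arc::new(AtomicU64::new(7));
    let stamper_clone = Arc::clone(&stamper);
    assert_eq!(stamper.fetch_add(1, Ordering::SeqCst), 7);
    assert_eq!(stamper_clone.fetch_add(1, Ordering::SeqCst), 8);
    // A revert is a plain store: stamping resumes from the new value.
    stamper.store(6, Ordering::SeqCst);
    assert_eq!(stamper_clone.fetch_add(1, Ordering::SeqCst), 6);
}
```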
View File

@@ -226,7 +226,7 @@ mod docset;
pub use self::docset::{DocSet, SkipResult};
pub use core::SegmentComponent;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
pub use core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use core::{InvertedIndexReader, SegmentReader};
pub use directory::Directory;
pub use indexer::IndexWriter;
@@ -254,6 +254,16 @@ pub mod merge_policy {
/// as they are added in the segment.
pub type DocId = u32;
/// A u64 assigned incrementally to every operation.
///
/// All operations modifying the index receive a monotonic Opstamp.
/// The resulting state of the index is consistent with the opstamp ordering.
///
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
/// with an opstamp `<= 32_423`. A delete operation with opstamp `n` will not affect a document added
/// with opstamp `n+1`.
pub type Opstamp = u64;
/// A f32 that represents the relevance of the document to the query
///
/// This is modelled internally as a `f32`. The

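A sketch of that ordering contract in action; the field name and buffer size are illustrative:

```rust
#[macro_use]
extern crate tantivy;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, Opstamp};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    let add_stamp: Opstamp = writer.add_document(doc!(text => "a"));
    // The commit reflects every operation stamped `<=` its own opstamp.
    let commit_stamp: Opstamp = writer.commit()?;
    assert!(commit_stamp > add_stamp);
    Ok(())
}
```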
View File

@@ -1,7 +1,8 @@
use core::Searcher;
use core::SegmentReader;
use docset::DocSet;
use query::{Query, Scorer, Weight};
use query::explanation::does_not_match;
use query::{Explanation, Query, Scorer, Weight};
use DocId;
use Result;
use Score;
@@ -29,6 +30,13 @@ impl Weight for AllWeight {
max_doc: reader.max_doc(),
}))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
if doc >= reader.max_doc() {
return Err(does_not_match(doc));
}
Ok(Explanation::new("AllQuery", 1f32))
}
}
enum State {

View File

@@ -1,12 +1,14 @@
use common::BitSet;
use core::SegmentReader;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{BitSetDocSet, Explanation};
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use tantivy_fst::Automaton;
use termdict::{TermDictionary, TermStreamer};
use Result;
use DocId;
use TantivyError;
use {Result, SkipResult};
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
@@ -56,4 +58,15 @@ where
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) == SkipResult::Reached {
Ok(Explanation::new("AutomatonScorer", 1.0f32))
} else {
Err(TantivyError::InvalidArgument(
"Document does not exist".to_string(),
))
}
}
}

View File

@@ -1,4 +1,5 @@
use fieldnorm::FieldNormReader;
use query::Explanation;
use Score;
use Searcher;
use Term;
@@ -26,18 +27,13 @@ fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
#[derive(Clone)]
pub struct BM25Weight {
idf_explain: Explanation,
weight: f32,
cache: [f32; 256],
average_fieldnorm: f32,
}
impl BM25Weight {
pub fn null() -> BM25Weight {
BM25Weight {
weight: 0f32,
cache: [1f32; 256],
}
}
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
assert!(!terms.is_empty(), "BM25 requires at least one term");
let field = terms[0].field();
@@ -58,20 +54,37 @@ impl BM25Weight {
}
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
let mut idf_explain: Explanation;
if terms.len() == 1 {
let term_doc_freq = searcher.doc_freq(&terms[0]);
let idf = idf(term_doc_freq, total_num_docs);
idf_explain =
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
idf_explain.add_const(
"n, number of docs containing this term",
term_doc_freq as f32,
);
idf_explain.add_const("N, total number of docs", total_num_docs as f32);
} else {
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
idf_explain = Explanation::new("idf", idf);
}
BM25Weight::new(idf_explain, average_fieldnorm)
}
fn new(idf: f32, average_fieldnorm: f32) -> BM25Weight {
fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
let weight = idf_explain.value() * (1f32 + K1);
BM25Weight {
weight: idf * (1f32 + K1),
idf_explain,
weight,
cache: compute_tf_cache(average_fieldnorm),
average_fieldnorm,
}
}
@@ -81,6 +94,37 @@ impl BM25Weight {
let term_freq = term_freq as f32;
self.weight * term_freq / (term_freq + norm)
}
pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation {
// The explain format is directly copied from Lucene's.
// (So, Kudos to Lucene)
let score = self.score(fieldnorm_id, term_freq);
let norm = self.cache[fieldnorm_id as usize];
let term_freq = term_freq as f32;
let right_factor = term_freq / (term_freq + norm);
let mut tf_explanation = Explanation::new(
"freq / (freq + k1 * (1 - b + b * dl / avgdl))",
right_factor,
);
tf_explanation.add_const("freq, occurrences of term within document", term_freq);
tf_explanation.add_const("k1, term saturation parameter", K1);
tf_explanation.add_const("b, length normalization parameter", B);
tf_explanation.add_const(
"dl, length of field",
FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32,
);
tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm);
let mut explanation = Explanation::new("TermQuery, product of...", score);
explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32));
explanation.add_detail(self.idf_explain.clone());
explanation.add_detail(tf_explanation);
explanation
}
}
#[cfg(test)]

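Putting the two factors together, the score that `explain` decomposes is the standard BM25 formula, matching the descriptions emitted above:

$$\mathrm{score} = \underbrace{\ln\!\left(1 + \frac{N - n + 0.5}{n + 0.5}\right)}_{\text{idf}} \cdot (k_1 + 1) \cdot \frac{\mathrm{freq}}{\mathrm{freq} + k_1\left(1 - b + b \cdot \frac{dl}{avgdl}\right)}$$

where $n$ is the number of documents containing the term, $N$ the total number of documents, $dl$ the field length, and $avgdl$ its average.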
View File

@@ -1,5 +1,5 @@
use core::SegmentReader;
use query::intersect_scorers;
use query::explanation::does_not_match;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
use query::EmptyScorer;
@@ -9,8 +9,10 @@ use query::RequiredOptionalScorer;
use query::Scorer;
use query::Union;
use query::Weight;
use query::{intersect_scorers, Explanation};
use std::collections::HashMap;
use Result;
use {DocId, SkipResult};
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
where
@@ -50,10 +52,10 @@ impl BooleanWeight {
}
}
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
fn per_occur_scorers(
&self,
reader: &SegmentReader,
) -> Result<Box<Scorer>> {
) -> Result<HashMap<Occur, Vec<Box<Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<Scorer>>> = HashMap::new();
for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<Scorer> = subweight.scorer(reader)?;
@@ -62,6 +64,14 @@ impl BooleanWeight {
.or_insert_with(Vec::new)
.push(sub_scorer);
}
Ok(per_occur_scorers)
}
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self,
reader: &SegmentReader,
) -> Result<Box<Scorer>> {
let mut per_occur_scorers = self.per_occur_scorers(reader)?;
let should_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
.remove(&Occur::Should)
@@ -118,4 +128,31 @@ impl Weight for BooleanWeight {
self.complex_scorer::<DoNothingCombiner>(reader)
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
if !self.scoring_enabled {
return Ok(Explanation::new("BooleanQuery with no scoring", 1f32));
}
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
for &(ref occur, ref subweight) in &self.weights {
if is_positive_occur(*occur) {
if let Ok(child_explanation) = subweight.explain(reader, doc) {
explanation.add_detail(child_explanation);
}
}
}
Ok(explanation)
}
}
fn is_positive_occur(occur: Occur) -> bool {
match occur {
Occur::Must | Occur::Should => true,
Occur::MustNot => false,
}
}

View File

@@ -18,8 +18,8 @@ mod tests {
use query::Scorer;
use query::TermQuery;
use schema::*;
use DocId;
use Index;
use {DocAddress, DocId};
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder();
@@ -205,4 +205,167 @@ mod tests {
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
}
}
// motivated by #554
#[test]
fn test_bm25_several_fields() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(
// tf = 1 0
title => "Законы притяжения Оксана Кулакова",
// tf = 1 0
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
));
index_writer.add_document(doc!(
// tf = 1 0
title => "Любимые русские пироги (Оксана Путан)",
// tf = 2 0
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
));
index_writer.add_document(doc!(
// tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..1_000 {
index_writer.add_document(doc!(
title => "a b d e f g",
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
));
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, text]);
let query = query_parser
.parse_query("Оксана Лифенко")
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
scorer.advance();
let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
r#"{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 6.551476,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.658984,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 3.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.5262329,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 4.0,
"description": "dl, length of field"
},
{
"value": 5.997009,
"description": "avgdl, average length of field"
}
]
}
]
},
{
"value": 6.446235,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.9954567,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 2.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.4887212,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 20.0,
"description": "dl, length of field"
},
{
"value": 24.123629,
"description": "avgdl, average length of field"
}
]
}
]
}
]
}
]
}"#
);
}
}
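As a sanity check on the numbers in the explanation above, each clause value can be recomputed directly from the formulas quoted in the descriptions: idf = ln(1 + (N - n + 0.5) / (n + 0.5)), the term-frequency factor freq / (freq + k1 * (1 - b + b * dl / avgdl)), and the clause value (k1 + 1) * idf * tf. The following standalone sketch (not part of this changeset) reproduces the first TermQuery clause:

// Standalone sketch: recompute the first TermQuery clause of the explanation.
fn main() {
    let (n, big_n) = (3.0f32, 1003.0f32); // docs containing the term / total docs
    let idf = (1.0 + (big_n - n + 0.5) / (n + 0.5)).ln();
    let (freq, k1, b, dl, avgdl) = (1.0f32, 1.2f32, 0.75f32, 4.0f32, 5.997009f32);
    let tf = freq / (freq + k1 * (1.0 - b + b * dl / avgdl));
    let clause = (k1 + 1.0) * idf * tf;
    assert!((idf - 5.658984).abs() < 1e-4); // matches the "idf" node
    assert!((tf - 0.5262329).abs() < 1e-5); // matches the "freq / ..." node
    assert!((clause - 6.551476).abs() < 1e-4); // matches the clause value
    println!("idf={} tf={} clause={}", idf, tf, clause);
}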


@@ -1,6 +1,7 @@
use super::Scorer;
use query::Query;
use query::explanation::does_not_match;
use query::Weight;
use query::{Explanation, Query};
use DocId;
use DocSet;
use Result;
@@ -32,6 +33,10 @@ impl Weight for EmptyWeight {
fn scorer(&self, _reader: &SegmentReader) -> Result<Box<Scorer>> {
Ok(Box::new(EmptyScorer))
}
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
Err(does_not_match(doc))
}
}
/// `EmptyScorer` is a dummy `Scorer` in which no document matches.
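Since `EmptyWeight` matches no document, its `explain` unconditionally reports a mismatch. A minimal sketch of that contract against the public API (assuming this branch of tantivy as a dependency; the field name is invented):

#[macro_use]
extern crate tantivy;

use tantivy::query::{EmptyQuery, Query};
use tantivy::schema::{Schema, TEXT};
use tantivy::{DocAddress, Index};

fn main() {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    writer.add_document(doc!(text => "hello"));
    writer.commit().unwrap();
    let searcher = index.reader().unwrap().searcher();
    // EmptyQuery matches nothing, so explaining any address must fail
    // with the `does_not_match` error.
    assert!(EmptyQuery.explain(&searcher, DocAddress(0u32, 0u32)).is_err());
}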

src/query/explanation.rs Normal file

@@ -0,0 +1,51 @@
use {DocId, TantivyError};
pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))
}
/// Object describing the score of a given document.
/// It is organized as a tree.
///
/// `.to_pretty_json()` can be useful to print out a human-readable
/// representation of this tree when debugging a given score.
#[derive(Clone, Serialize)]
pub struct Explanation {
value: f32,
description: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
details: Vec<Explanation>,
}
impl Explanation {
/// Creates a new explanation object.
pub fn new<T: ToString>(description: T, value: f32) -> Explanation {
Explanation {
value,
description: description.to_string(),
details: vec![],
}
}
/// Returns the value associated with the current node.
pub fn value(&self) -> f32 {
self.value
}
/// Adds some detail explaining a part of the current node's formula.
///
/// Details are treated as children of the current node.
pub fn add_detail(&mut self, child_explanation: Explanation) {
self.details.push(child_explanation);
}
/// Shortcut for `self.details.push(Explanation::new(name, value));`
pub fn add_const<T: ToString>(&mut self, name: T, value: f32) {
self.details.push(Explanation::new(name, value));
}
/// Returns an indented JSON representation of the explanation tree, for debugging.
pub fn to_pretty_json(&self) -> String {
serde_json::to_string_pretty(self).unwrap()
}
}
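The tree structure mirrors what `to_pretty_json` prints in the test above. A small sketch of assembling such a tree by hand through this public API (descriptions and values copied from the explanation above, purely for illustration):

extern crate tantivy;

use tantivy::query::Explanation;

fn main() {
    // The root node carries the combined value; details break it down.
    let mut root = Explanation::new("TermQuery, product of...", 6.551476);
    root.add_const("(K1+1)", 2.2);
    let mut idf = Explanation::new(
        "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
        5.658984,
    );
    idf.add_const("n, number of docs containing this term", 3.0);
    idf.add_const("N, total number of docs", 1003.0);
    root.add_detail(idf);
    println!("{}", root.to_pretty_json());
}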


@@ -9,6 +9,7 @@ mod bm25;
mod boolean_query;
mod empty_query;
mod exclude;
mod explanation;
mod fuzzy_query;
mod intersection;
mod occur;
@@ -39,6 +40,7 @@ pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::occur::Occur;


@@ -93,21 +93,12 @@ impl Query for PhraseQuery {
field_name
)));
}
if scoring_enabled {
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
Ok(Box::new(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,
true,
)))
} else {
Ok(Box::new(PhraseWeight::new(
self.phrase_terms.clone(),
BM25Weight::null(),
false,
)))
}
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
let phrase_weight: PhraseWeight =
PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
Ok(Box::new(phrase_weight))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {


@@ -148,9 +148,13 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
}
pub fn phrase_count(&self) -> u32 {
self.phrase_count
}
fn phrase_match(&mut self) -> bool {
if self.score_needed {
let count = self.phrase_count();
let count = self.compute_phrase_count();
self.phrase_count = count;
count > 0u32
} else {
@@ -183,7 +187,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn phrase_count(&mut self) -> u32 {
fn compute_phrase_count(&mut self) -> u32 {
{
self.intersection_docset
.docset_mut_specialized(0)


@@ -1,12 +1,16 @@
use super::PhraseScorer;
use core::SegmentReader;
use fieldnorm::FieldNormReader;
use postings::SegmentPostings;
use query::bm25::BM25Weight;
use query::EmptyScorer;
use query::explanation::does_not_match;
use query::Scorer;
use query::Weight;
use query::{EmptyScorer, Explanation};
use schema::IndexRecordOption;
use schema::Term;
use Result;
use {DocId, DocSet};
use {Result, SkipResult};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -27,13 +31,18 @@ impl PhraseWeight {
score_needed,
}
}
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let similarity_weight = self.similarity_weight.clone();
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
let field = self.phrase_terms[0].1.field();
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
reader.get_fieldnorms_reader(field)
}
fn phrase_scorer(
&self,
reader: &SegmentReader,
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.clone();
let fieldnorm_reader = self.fieldnorm_reader(reader);
if reader.has_deletes() {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
@@ -43,10 +52,10 @@ impl Weight for PhraseWeight {
{
term_postings_list.push((offset, postings));
} else {
return Ok(Box::new(EmptyScorer));
return Ok(None);
}
}
Ok(Box::new(PhraseScorer::new(
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
@@ -61,10 +70,10 @@ impl Weight for PhraseWeight {
{
term_postings_list.push((offset, postings));
} else {
return Ok(Box::new(EmptyScorer));
return Ok(None);
}
}
Ok(Box::new(PhraseScorer::new(
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
@@ -73,3 +82,30 @@ impl Weight for PhraseWeight {
}
}
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader)? {
Ok(Box::new(scorer))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
}
let mut scorer = scorer_opt.unwrap();
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
explanation.add_detail(self.similarity_weight.explain(fieldnorm_id, phrase_count));
Ok(explanation)
}
}


@@ -1,10 +1,11 @@
use super::Weight;
use core::searcher::Searcher;
use downcast_rs;
use query::Explanation;
use std::collections::BTreeSet;
use std::fmt;
use Result;
use Term;
use {downcast_rs, DocAddress};
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -48,6 +49,13 @@ pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
/// See [`Weight`](./trait.Weight.html).
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>>;
/// Returns an `Explanation` for the score of the document.
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> Result<Explanation> {
let reader = searcher.segment_reader(doc_address.segment_ord());
let weight = self.weight(searcher, true)?;
weight.explain(reader, doc_address.doc())
}
/// Returns the number of documents matching the query.
fn count(&self, searcher: &Searcher) -> Result<usize> {
let weight = self.weight(searcher, false)?;

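With this default method, any `Query` can explain a hit end to end. A minimal usage sketch (assuming this branch of tantivy as a dependency; the field name and document are invented):

#[macro_use]
extern crate tantivy;

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    writer.add_document(doc!(body => "maitre corbeau sur un arbre perche"));
    writer.commit().unwrap();
    let searcher = index.reader().unwrap().searcher();
    let query = QueryParser::for_index(&index, vec![body])
        .parse_query("corbeau")
        .unwrap();
    for (_score, doc_address) in searcher.search(&query, &TopDocs::with_limit(1)).unwrap() {
        // The default impl builds the weight with scoring enabled and
        // delegates to Weight::explain on the hit's segment.
        let explanation = query.explain(&searcher, doc_address).unwrap();
        println!("{}", explanation.to_pretty_json());
    }
}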

@@ -2,15 +2,17 @@ use common::BitSet;
use core::Searcher;
use core::SegmentReader;
use error::TantivyError;
use query::BitSetDocSet;
use query::explanation::does_not_match;
use query::ConstScorer;
use query::{BitSetDocSet, Explanation};
use query::{Query, Scorer, Weight};
use schema::Type;
use schema::{Field, IndexRecordOption, Term};
use std::collections::Bound;
use std::ops::Range;
use termdict::{TermDictionary, TermStreamer};
use Result;
use DocId;
use {Result, SkipResult};
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>,
@@ -286,6 +288,14 @@ impl Weight for RangeWeight {
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
Ok(Explanation::new("RangeQuery", 1.0f32))
}
}
#[cfg(test)]
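`RangeWeight` scores every match through a `ConstScorer`, so a matching document always explains as a flat 1.0. A sketch of that behavior (assuming this branch of tantivy as a dependency, and the `INT_INDEXED` option name of this era; field and values are invented):

#[macro_use]
extern crate tantivy;

use tantivy::query::{Query, RangeQuery};
use tantivy::schema::{Schema, INT_INDEXED};
use tantivy::{DocAddress, Index};

fn main() {
    let mut schema_builder = Schema::builder();
    let year = schema_builder.add_u64_field("year", INT_INDEXED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    writer.add_document(doc!(year => 1975u64));
    writer.commit().unwrap();
    let searcher = index.reader().unwrap().searcher();
    // Any document matched by the range explains as a constant 1.0.
    let query = RangeQuery::new_u64(year, 1970u64..1980u64);
    let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
    assert_eq!(explanation.value(), 1.0f32);
}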


@@ -1,5 +1,5 @@
use docset::{DocSet, SkipResult};
use query::Scorer;
use query::{Explanation, Scorer};
use DocId;
use Score;
@@ -28,11 +28,31 @@ impl TermScorer {
}
}
impl TermScorer {
pub fn term_freq(&self) -> u32 {
self.postings.term_freq()
}
pub fn fieldnorm_id(&self) -> u8 {
self.fieldnorm_reader.fieldnorm_id(self.doc())
}
pub fn explain(&self) -> Explanation {
let fieldnorm_id = self.fieldnorm_id();
let term_freq = self.term_freq();
self.similarity_weight.explain(fieldnorm_id, term_freq)
}
}
impl DocSet for TermScorer {
fn advance(&mut self) -> bool {
self.postings.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
}
fn doc(&self) -> DocId {
self.postings.doc()
}
@@ -40,17 +60,12 @@ impl DocSet for TermScorer {
fn size_hint(&self) -> u32 {
self.postings.size_hint()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
}
}
impl Scorer for TermScorer {
fn score(&mut self) -> Score {
let doc = self.doc();
let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
self.similarity_weight
.score(fieldnorm_id, self.postings.term_freq())
let fieldnorm_id = self.fieldnorm_id();
let term_freq = self.term_freq();
self.similarity_weight.score(fieldnorm_id, term_freq)
}
}


@@ -3,11 +3,13 @@ use core::SegmentReader;
use docset::DocSet;
use postings::SegmentPostings;
use query::bm25::BM25Weight;
use query::Scorer;
use query::explanation::does_not_match;
use query::Weight;
use query::{Explanation, Scorer};
use schema::IndexRecordOption;
use Result;
use DocId;
use Term;
use {Result, SkipResult};
pub struct TermWeight {
term: Term,
@@ -17,25 +19,16 @@ pub struct TermWeight {
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let similarity_weight = self.similarity_weight.clone();
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);
if let Some(segment_postings) = postings_opt {
Ok(Box::new(TermScorer::new(
segment_postings,
fieldnorm_reader,
similarity_weight,
)))
} else {
Ok(Box::new(TermScorer::new(
SegmentPostings::empty(),
fieldnorm_reader,
similarity_weight,
)))
let term_scorer = self.scorer_specialized(reader)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
Ok(scorer.explain())
}
fn count(&self, reader: &SegmentReader) -> Result<u32> {
@@ -64,4 +57,26 @@ impl TermWeight {
similarity_weight,
}
}
fn scorer_specialized(&self, reader: &SegmentReader) -> Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let similarity_weight = self.similarity_weight.clone();
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);
if let Some(segment_postings) = postings_opt {
Ok(TermScorer::new(
segment_postings,
fieldnorm_reader,
similarity_weight,
))
} else {
Ok(TermScorer::new(
SegmentPostings::empty(),
fieldnorm_reader,
similarity_weight,
))
}
}
}


@@ -1,6 +1,7 @@
use super::Scorer;
use core::SegmentReader;
use Result;
use query::Explanation;
use {DocId, Result};
/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -11,6 +12,9 @@ pub trait Weight: Send + Sync + 'static {
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
/// Returns an `Explanation` for the given document.
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation>;
/// Returns the number of documents within the given `SegmentReader`.
fn count(&self, reader: &SegmentReader) -> Result<u32> {
let mut scorer = self.scorer(reader)?;

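The three `Weight::explain` implementations above (term, phrase, range) share the same shape: position a fresh scorer on the target document and fail with `does_not_match` if it cannot land exactly there. A crate-internal sketch of that pattern (`query::explanation::does_not_match` is `pub(crate)`, so this only compiles inside tantivy itself; the helper name is invented):

use core::SegmentReader;
use docset::{DocSet, SkipResult};
use query::explanation::does_not_match;
use query::{Explanation, Scorer, Weight};
use {DocId, Result};

// Invented helper illustrating the skip-then-explain pattern.
fn explain_via_scorer(weight: &Weight, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
    let mut scorer = weight.scorer(reader)?;
    // skip_next either lands on `doc` (Reached), overshoots it, or exhausts
    // the docset; anything but Reached means `doc` does not match this weight.
    if scorer.skip_next(doc) != SkipResult::Reached {
        return Err(does_not_match(doc));
    }
    // A matching document gets a (here single-node) explanation of its score.
    Ok(Explanation::new("Generic scorer", scorer.score()))
}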

@@ -16,7 +16,7 @@ const BLOCK_SIZE: usize = 16_384;
the store is written to disk as documents are being added,
/// as opposed to when the segment is getting finalized.
///
/// The skip list index on the other hand, is build in memory.
/// The skip list index on the other hand, is built in memory.
///
pub struct StoreWriter {
doc: DocId,