Compare commits

..

12 Commits

Author SHA1 Message Date
Paul Masurel
8a3e520bfe Bumping bitpacking version 2019-06-26 09:54:11 +09:00
Antoine Catton
bcd7386fc5 Add crates.io shield to the README (#572) 2019-06-18 11:19:06 +09:00
Paul Masurel
c23a7c992b Closes #552 (#570)
The different handles to `SegmentMeta` are closed before calling gc on
end_merge.
2019-06-16 14:12:13 +09:00
Paul Masurel
2a88094ec4 Disabling travis on OSX (#571) 2019-06-16 14:12:01 +09:00
Paul Masurel
ca3cfddab4 adding cond (#568) 2019-06-16 11:59:26 +09:00
Paul Masurel
7bd9f9773b trying to fix doc upload (#567) 2019-06-16 11:22:51 +09:00
Paul Masurel
e2da92fcb5 Petr tik n510 clear index (#566)
* Enables clearing the index

Closes #510

* Adds an example to clear and rebuild the index

* Addressing code review

Moved the example from examples/ to docstring above `clear`

* Corrected minor typos and missed/duplicate words

* Added stamper.revert method to be used for rollback

Added type alias for Opstamp

Moved to AtomicU64 on stable rust (since 1.34)

* Change the method name and doc-string

* Remove rollback from delete_all_documents

test_add_then_delete_all_documents fails with --test-threads 2

* Passes all the tests with any number of test-threads

(ran locally 5 times)

* Addressed code review

Deleted comments with debug info
changed ReloadPolicy to Manual

* Removing useless garbage_collect call and updated CHANGELOG
2019-06-12 09:40:03 +09:00
petr-tik
876e1451c4 Resume uploading docs to gh-pages (#565)
* Fixes #546

Generate docs and upload them. Need GH_TOKEN env var to be set in travis settings

* Investigate what TRAVIS* env vars are set
2019-06-12 09:30:09 +09:00
dependabot-preview[bot]
a37d2f9777 Update winapi requirement from 0.2 to 0.3 (#537)
* Update winapi requirement from 0.2 to 0.3

Updates the requirements on [winapi](https://github.com/retep998/winapi-rs) to permit the latest version.
- [Release notes](https://github.com/retep998/winapi-rs/releases)
- [Commits](https://github.com/retep998/winapi-rs/commits/0.3.7)

Signed-off-by: dependabot[bot] <support@dependabot.com>

* Fixing upgrading winapi (hopefully).
2019-06-06 10:23:13 +09:00
Paul Masurel
4822940b19 Issue/36 (#559)
* Added explanation

* Explain

* Splitting weight and idf

* Added comments

Closes #36
2019-06-06 10:03:54 +09:00
Paul Masurel
d590f4c6b0 Comments for IndexMeta (#560) 2019-06-06 09:24:31 +09:00
Paul Masurel
edfa619519 Update .travis.yml 2019-05-29 16:45:56 +09:00
33 changed files with 808 additions and 418 deletions

View File

@@ -10,7 +10,7 @@ env:
global:
- CRATE_NAME=tantivy
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
@@ -38,12 +38,12 @@ matrix:
# Linux
#- env: TARGET=aarch64-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 UPLOAD_DOCS=1
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
# OSX
- env: TARGET=x86_64-apple-darwin
os: osx
#- env: TARGET=x86_64-apple-darwin
# os: osx
before_install:
- set -e
@@ -52,6 +52,7 @@ before_install:
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- env | grep "TRAVIS"
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
@@ -61,12 +62,14 @@ before_script:
script:
- bash ci/script.sh
after_success:
- cargo doc-upload
before_deploy:
- sh ci/before_deploy.sh
after_success:
# Needs GH_TOKEN env var to be set in travis settings
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"

View File

@@ -5,6 +5,9 @@ Tantivy 0.10.0
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
- Added `.explain(...)` in `Query` and `Weight` (@pmasurel)
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
All segments are simply removed.
Minor
---------

View File

@@ -42,7 +42,7 @@ owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
bitpacking = "0.6"
bitpacking = "0.7"
census = "0.2"
fnv = "1.0.6"
owned-read = "0.4"
@@ -54,7 +54,7 @@ murmurhash32 = "0.2"
chrono = "0.4"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
winapi = "0.3"
[dev-dependencies]
rand = "0.6"

View File

@@ -4,6 +4,7 @@
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)

View File

@@ -537,4 +537,35 @@ mod tests {
}
assert_eq!(count, 2);
}
#[test]
fn garbage_collect_works_as_intended() {
let directory = RAMDirectory::create();
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema).unwrap();
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i));
}
writer.commit().unwrap();
let mem_right_after_commit = directory.total_mem_usage();
thread::sleep(Duration::from_millis(1_000));
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads().unwrap();
let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 8_000);
assert!(mem_right_after_merge_finished < mem_right_after_commit);
}
}

View File

@@ -14,14 +14,27 @@ use Opstamp;
///
#[derive(Clone, Serialize, Deserialize)]
pub struct IndexMeta {
/// List of the `SegmentMeta` information associated with each finalized segment of the index.
pub segments: Vec<SegmentMeta>,
/// Index `Schema`
pub schema: Schema,
/// Opstamp associated with the last `commit` operation.
pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
pub payload: Option<String>,
}
impl IndexMeta {
/// Creates an `IndexMeta` object representing a brand new `Index`
/// with the given `Schema`.
///
/// This new index does not contain any segments.
/// Its opstamp will be `0u64`.
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
segments: vec![],
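
As an editorial aside, a minimal sketch of the invariants documented above, assuming `with_schema` remains reachable through the `IndexMeta` re-export added to src/lib.rs later in this diff:

use tantivy::schema::Schema;
use tantivy::IndexMeta;

fn main() {
    // A brand new index: no segments, opstamp 0, no commit payload yet.
    let meta = IndexMeta::with_schema(Schema::builder().build());
    assert!(meta.segments.is_empty());
    assert_eq!(meta.opstamp, 0u64);
    assert!(meta.payload.is_none());
}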

View File

@@ -246,7 +246,7 @@ impl SegmentReader {
let termdict_source = self
.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
.expect("Failed to open field term dictionary in composite file. Is the field indexed?");
let positions_source = self
.positions_composite

View File

@@ -6,7 +6,7 @@ use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
pub enum LockError {
/// Failed to acquired a lock as it is already hold by another
/// Failed to acquired a lock as it is already held by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.

View File

@@ -320,7 +320,7 @@ impl MmapDirectory {
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
use winapi::um::winbase;
open_opts
.write(true)

View File

@@ -103,6 +103,10 @@ impl InnerDirectory {
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
self.watch_router.subscribe(watch_handle)
}
fn total_mem_usage(&self) -> usize {
self.fs.values().map(|f| f.len()).sum()
}
}
impl fmt::Debug for RAMDirectory {
@@ -126,6 +130,12 @@ impl RAMDirectory {
pub fn create() -> RAMDirectory {
Self::default()
}
/// Returns the sum of the sizes of the different files
/// in the RAMDirectory.
pub fn total_mem_usage(&self) -> usize {
self.fs.read().unwrap().total_mem_usage()
}
}
impl Directory for RAMDirectory {
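
As an editorial aside, a minimal sketch of the new `total_mem_usage` accessor in use; every call below already appears elsewhere in this diff, and the document contents are made up:

#[macro_use]
extern crate tantivy;
use tantivy::directory::RAMDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let directory = RAMDirectory::create();
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create(directory.clone(), schema_builder.build())?;
    let mut writer = index.writer_with_num_threads(1, 50_000_000)?;
    for _ in 0..1_000 {
        writer.add_document(doc!(text => "some text"));
    }
    writer.commit()?;
    // Sum of the sizes of the files currently held by this RAMDirectory.
    println!("index size in RAM: {} bytes", directory.total_mem_usage());
    Ok(())
}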

View File

@@ -332,7 +332,8 @@ fn index_documents(
}
impl IndexWriter {
/// The index writer
/// If there are merging threads, blocks until they all finish their work and
/// then drops the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
@@ -383,7 +384,6 @@ impl IndexWriter {
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.operation_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
@@ -462,6 +462,52 @@ impl IndexWriter {
self.segment_updater.garbage_collect_files()
}
/// Deletes all documents from the index
///
/// Requires a subsequent `commit` for the deletion to take effect.
/// Enables users to rebuild the index
/// by clearing and resubmitting the necessary documents.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::query::QueryParser;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::*;
/// use tantivy::Index;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT | STORED);
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema.clone());
///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
/// index_writer.commit()?;
///
/// let clear_res = index_writer.delete_all_documents().unwrap();
/// // have to commit, otherwise deleted terms remain available
/// index_writer.commit()?;
///
/// let searcher = index.reader()?.searcher();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query_promo = query_parser.parse_query("Promotheus")?;
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
///
/// assert!(top_docs_promo.is_empty());
/// Ok(())
/// }
/// ```
pub fn delete_all_documents(&mut self) -> Result<Opstamp> {
// Delete segments
self.segment_updater.remove_all_segments();
// Return new stamp - reverted stamp
self.stamper.revert(self.committed_opstamp);
Ok(self.committed_opstamp)
}
/// Merges a given list of segments
///
/// `segment_ids` is required to be non-empty.
@@ -489,19 +535,22 @@ impl IndexWriter {
/// Rollback to the last commit
///
/// This cancels all of the update that
/// happened before after the last commit.
/// This cancels all of the updates that
/// happened after the last commit.
/// After calling rollback, the index is in the same
/// state as it was after the last commit.
///
/// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> Result<Opstamp> {
info!("Rolling back to opstamp {}", self.committed_opstamp);
self.rollback_impl()
}
/// Private implementation of rollback.
fn rollback_impl(&mut self) -> Result<Opstamp> {
// marks the segment updater as killed. From now on, all
// segment updates will be ignored.
self.segment_updater.kill();
let document_receiver = self.operation_receiver.clone();
// take the directory lock to create a new index_writer.
@@ -1049,4 +1098,145 @@ mod tests {
assert_eq!(num_docs_containing("b"), 0);
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
}
#[test]
fn test_add_then_delete_all_documents() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
reader.reload().unwrap();
let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
};
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let commit_tstamp = index_writer.commit().unwrap();
assert!(commit_tstamp > add_tstamp);
index_writer.delete_all_documents().unwrap();
index_writer.commit().unwrap();
// Search for documents with the same term that we added
assert_eq!(num_docs_containing("a"), 0);
}
#[test]
fn test_delete_all_documents_rollback_correct_stamp() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
// commit documents - they are now available
let first_commit = index_writer.commit();
assert!(first_commit.is_ok());
let first_commit_tstamp = first_commit.unwrap();
assert!(first_commit_tstamp > add_tstamp);
// delete_all_documents the index
let clear_tstamp = index_writer.delete_all_documents().unwrap();
assert_eq!(clear_tstamp, add_tstamp);
// commit the clear command - now documents aren't available
let second_commit = index_writer.commit();
assert!(second_commit.is_ok());
let second_commit_tstamp = second_commit.unwrap();
// add new documents again
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
}
// rollback to last commit, when index was empty
let rollback = index_writer.rollback();
assert!(rollback.is_ok());
let rollback_tstamp = rollback.unwrap();
assert_eq!(rollback_tstamp, second_commit_tstamp);
// working with an empty index == no documents
let term_b = Term::from_field_text(text_field, "b");
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0);
}
#[test]
fn test_delete_all_documents_then_add() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let res = index_writer.delete_all_documents();
assert!(res.is_ok());
assert!(index_writer.commit().is_ok());
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
assert!(index_writer.commit().is_ok());
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
}
#[test]
fn test_delete_all_documents_and_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
let comm = index_writer.commit();
assert!(comm.is_ok());
let commit_tstamp = comm.unwrap();
// clear but don't commit!
let clear_tstamp = index_writer.delete_all_documents().unwrap();
// clear_tstamp should reset to before the last commit
assert!(clear_tstamp < commit_tstamp);
// rollback
let _rollback_tstamp = index_writer.rollback().unwrap();
// Find original docs in the index
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
}
#[test]
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
assert!(commit.is_ok());
}
#[test]
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
assert!(commit.is_ok());
let clear_again = index_writer.delete_all_documents();
let commit_again = index_writer.commit();
assert!(clear_again.is_ok());
assert!(commit_again.is_ok());
}
}

View File

@@ -118,6 +118,12 @@ impl SegmentManager {
});
}
pub(crate) fn remove_all_segments(&self) {
let mut registers_lock = self.write();
registers_lock.committed.clear();
registers_lock.uncommitted.clear();
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.committed.clear();

View File

@@ -42,9 +42,9 @@ use Result;
/// Save the index meta file.
/// This operation is atomic :
/// Either
// - it fails, in which case an error is returned,
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
@@ -70,6 +70,7 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
@@ -213,6 +214,11 @@ impl SegmentUpdater {
}
}
/// Orders `SegmentManager` to remove all segments
pub(crate) fn remove_all_segments(&self) {
self.0.segment_manager.remove_all_segments();
}
pub fn kill(&mut self) {
self.0.killed.store(true, Ordering::Release);
}
@@ -223,7 +229,7 @@ impl SegmentUpdater {
/// Apply deletes up to the target opstamp to all segments.
///
/// Tne method returns copies of the segment entries,
/// The method returns copies of the segment entries,
/// updated with the delete information.
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries();
@@ -446,38 +452,41 @@ impl SegmentUpdater {
) -> Result<()> {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(e) =
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
{
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
);
if cfg!(test) {
panic!("Merge failed.");
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(e) = advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
);
if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
return;
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
return;
}
}
}
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
let previous_metas = segment_updater.load_metas();
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
segment_updater.consider_merge_options();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
} // we drop all possible handle to a now useless `SegmentMeta`.
segment_updater.garbage_collect_files_exec();
})
.wait()
@@ -651,4 +660,31 @@ mod tests {
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(reader.searcher().segment_readers().is_empty());
}
#[test]
fn test_remove_all_segments() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
index_writer.segment_updater().remove_all_segments();
let seg_vec = index_writer
.segment_updater()
.0
.segment_manager
.segment_entries();
assert!(seg_vec.is_empty());
}
}

View File

@@ -20,7 +20,7 @@ use Opstamp;
use Result;
/// A `SegmentWriter` is in charge of creating segment index from a
/// documents.
/// set of documents.
///
/// They creates the postings list in anonymous memory.
/// The segment is layed on disk when the segment gets `finalized`.

View File

@@ -28,6 +28,12 @@ impl Stamper {
end: start + n,
}
}
/// Reverts the stamper to a given `Opstamp` value and returns it
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
self.0.store(to_opstamp, Ordering::SeqCst);
to_opstamp
}
}
#[cfg(test)]
@@ -50,4 +56,17 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);
assert_eq!(stamper.stamp(), 7u64);
assert_eq!(stamper.stamp(), 8u64);
let stamper_clone = stamper.clone();
assert_eq!(stamper_clone.stamp(), 9u64);
stamper.revert(6);
assert_eq!(stamper.stamp(), 6);
assert_eq!(stamper_clone.stamp(), 7);
}
}

View File

@@ -226,7 +226,7 @@ mod docset;
pub use self::docset::{DocSet, SkipResult};
pub use core::SegmentComponent;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta, IndexMeta};
pub use core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use core::{InvertedIndexReader, SegmentReader};
pub use directory::Directory;
pub use indexer::IndexWriter;

View File

@@ -1,7 +1,8 @@
use core::Searcher;
use core::SegmentReader;
use docset::DocSet;
use query::{Query, Scorer, Weight};
use query::explanation::does_not_match;
use query::{Explanation, Query, Scorer, Weight};
use DocId;
use Result;
use Score;
@@ -29,6 +30,13 @@ impl Weight for AllWeight {
max_doc: reader.max_doc(),
}))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
if doc >= reader.max_doc() {
return Err(does_not_match(doc));
}
Ok(Explanation::new("AllQuery", 1f32))
}
}
enum State {

View File

@@ -1,12 +1,14 @@
use common::BitSet;
use core::SegmentReader;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{BitSetDocSet, Explanation};
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use tantivy_fst::Automaton;
use termdict::{TermDictionary, TermStreamer};
use Result;
use DocId;
use TantivyError;
use {Result, SkipResult};
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
@@ -56,4 +58,15 @@ where
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) == SkipResult::Reached {
Ok(Explanation::new("AutomatonScorer", 1.0f32))
} else {
Err(TantivyError::InvalidArgument(
"Document does not exist".to_string(),
))
}
}
}

View File

@@ -1,4 +1,5 @@
use fieldnorm::FieldNormReader;
use query::Explanation;
use Score;
use Searcher;
use Term;
@@ -26,18 +27,13 @@ fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
#[derive(Clone)]
pub struct BM25Weight {
idf_explain: Explanation,
weight: f32,
cache: [f32; 256],
average_fieldnorm: f32,
}
impl BM25Weight {
pub fn null() -> BM25Weight {
BM25Weight {
weight: 0f32,
cache: [1f32; 256],
}
}
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
assert!(!terms.is_empty(), "BM25 requires at least one term");
let field = terms[0].field();
@@ -58,20 +54,37 @@ impl BM25Weight {
}
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
let mut idf_explain: Explanation;
if terms.len() == 1 {
let term_doc_freq = searcher.doc_freq(&terms[0]);
let idf = idf(term_doc_freq, total_num_docs);
idf_explain =
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
idf_explain.add_const(
"n, number of docs containing this term",
term_doc_freq as f32,
);
idf_explain.add_const("N, total number of docs", total_num_docs as f32);
} else {
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
idf_explain = Explanation::new("idf", idf);
}
BM25Weight::new(idf_explain, average_fieldnorm)
}
fn new(idf: f32, average_fieldnorm: f32) -> BM25Weight {
fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
let weight = idf_explain.value() * (1f32 + K1);
BM25Weight {
weight: idf * (1f32 + K1),
idf_explain,
weight,
cache: compute_tf_cache(average_fieldnorm),
average_fieldnorm,
}
}
@@ -81,6 +94,37 @@ impl BM25Weight {
let term_freq = term_freq as f32;
self.weight * term_freq / (term_freq + norm)
}
pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation {
// The explain format is directly copied from Lucene's.
// (So, Kudos to Lucene)
let score = self.score(fieldnorm_id, term_freq);
let norm = self.cache[fieldnorm_id as usize];
let term_freq = term_freq as f32;
let right_factor = term_freq / (term_freq + norm);
let mut tf_explanation = Explanation::new(
"freq / (freq + k1 * (1 - b + b * dl / avgdl))",
right_factor,
);
tf_explanation.add_const("freq, occurrences of term within document", term_freq);
tf_explanation.add_const("k1, term saturation parameter", K1);
tf_explanation.add_const("b, length normalization parameter", B);
tf_explanation.add_const(
"dl, length of field",
FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32,
);
tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm);
let mut explanation = Explanation::new("TermQuery, product of...", score);
explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32));
explanation.add_detail(self.idf_explain.clone());
explanation.add_detail(tf_explanation);
explanation
}
}
#[cfg(test)]
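
As an editorial aside, a minimal sketch (not part of the diff) of the formula this explanation tree describes, assuming tantivy's K1 = 1.2 and B = 0.75 as surfaced by the explain output in the tests further down, and writing out the per-fieldnorm term that `compute_tf_cache` caches:

fn bm25_score(idf: f32, term_freq: f32, dl: f32, avgdl: f32) -> f32 {
    const K1: f32 = 1.2;
    const B: f32 = 0.75;
    // `weight` in the code above is idf * (K1 + 1); `norm` is the cached factor.
    let norm = K1 * (1.0 - B + B * dl / avgdl);
    idf * (K1 + 1.0) * term_freq / (term_freq + norm)
}

fn main() {
    // One occurrence of a term with idf 5.66 in a field of length 4 (average 6.0).
    println!("{}", bm25_score(5.66, 1.0, 4.0, 6.0));
}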

View File

@@ -1,5 +1,5 @@
use core::SegmentReader;
use query::intersect_scorers;
use query::explanation::does_not_match;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
use query::EmptyScorer;
@@ -9,8 +9,10 @@ use query::RequiredOptionalScorer;
use query::Scorer;
use query::Union;
use query::Weight;
use query::{intersect_scorers, Explanation};
use std::collections::HashMap;
use Result;
use {DocId, SkipResult};
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
where
@@ -50,10 +52,10 @@ impl BooleanWeight {
}
}
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
fn per_occur_scorers(
&self,
reader: &SegmentReader,
) -> Result<Box<Scorer>> {
) -> Result<HashMap<Occur, Vec<Box<Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<Scorer>>> = HashMap::new();
for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<Scorer> = subweight.scorer(reader)?;
@@ -62,6 +64,14 @@ impl BooleanWeight {
.or_insert_with(Vec::new)
.push(sub_scorer);
}
Ok(per_occur_scorers)
}
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self,
reader: &SegmentReader,
) -> Result<Box<Scorer>> {
let mut per_occur_scorers = self.per_occur_scorers(reader)?;
let should_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
.remove(&Occur::Should)
@@ -118,4 +128,31 @@ impl Weight for BooleanWeight {
self.complex_scorer::<DoNothingCombiner>(reader)
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
if !self.scoring_enabled {
return Ok(Explanation::new("BooleanQuery with no scoring", 1f32));
}
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
for &(ref occur, ref subweight) in &self.weights {
if is_positive_occur(*occur) {
if let Ok(child_explanation) = subweight.explain(reader, doc) {
explanation.add_detail(child_explanation);
}
}
}
Ok(explanation)
}
}
fn is_positive_occur(occur: Occur) -> bool {
match occur {
Occur::Must | Occur::Should => true,
Occur::MustNot => false,
}
}

View File

@@ -18,8 +18,8 @@ mod tests {
use query::Scorer;
use query::TermQuery;
use schema::*;
use DocId;
use Index;
use {DocAddress, DocId};
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder();
@@ -206,240 +206,6 @@ mod tests {
}
}
/*
DoC 0
{
"_index": "test",
"_type": "_doc",
"_id": "0",
"matched": true,
"explanation": {
"value": 6.2610235,
"description": "max of:",
"details": [{
"value": 6.1969156,
"description": "sum of:",
"details": [{
"value": 6.1969156,
"description": "weight(text:оксана in 561) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.1969156,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.65998,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 3,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.49766606,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 19.0,
"description": "dl, length of field",
"details": []
}, {
"value": 24.105577,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}, {
"value": 6.2610235,
"description": "sum of:",
"details": [{
"value": 6.2610235,
"description": "weight(title:оксана in 561) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.2610235,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.4086657,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 4,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.52617776,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 4.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}]
}
}
doc 2
{
"_index": "test",
"_type": "_doc",
"_id": "2",
"matched": true,
"explanation": {
"value": 11.911896,
"description": "max of:",
"details": [{
"value": 11.911896,
"description": "sum of:",
"details": [{
"value": 5.4068284,
"description": "weight(title:оксана in 0) [PerFieldSimilarity], result of:",
"details": [{
"value": 5.4068284,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.4086657,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 4,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.45439103,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 6.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}, {
"value": 6.505067,
"description": "weight(title:лифенко in 0) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.505067,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 6.5072775,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 1,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.45439103,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 6.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}]
}
}
*/
// motivated by #554
#[test]
fn test_bm25_several_fields() {
@@ -483,54 +249,123 @@ mod tests {
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
// let mut scores = vec![];
// while
println!("=====|");
scorer.advance();
dbg!("scorer.score()");
assert!(false);
// scores.push(scorer.score());
// assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
r#"{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 6.551476,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.658984,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 3.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.5262329,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 4.0,
"description": "dl, length of field"
},
{
"value": 5.997009,
"description": "avgdl, average length of field"
}
]
}
]
},
{
"value": 6.446235,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.9954567,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 2.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.4887212,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 20.0,
"description": "dl, length of field"
},
{
"value": 24.123629,
"description": "avgdl, average length of field"
}
]
}
]
}
]
}
// motivated by #554
#[test]
fn test_bm25_several_fields_bbb() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
));
index_writer.add_document(doc!(
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
));
index_writer.add_document(doc!(
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..100 {
index_writer.add_document(doc!(
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
));
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser
.parse_query("Оксана Лифенко")
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
let mut scores = vec![];
while scorer.advance() {
scores.push(scorer.score());
}
assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
index_writer.commit().unwrap();
]
}"#
);
}
}

View File

@@ -1,6 +1,7 @@
use super::Scorer;
use query::Query;
use query::explanation::does_not_match;
use query::Weight;
use query::{Explanation, Query};
use DocId;
use DocSet;
use Result;
@@ -32,6 +33,10 @@ impl Weight for EmptyWeight {
fn scorer(&self, _reader: &SegmentReader) -> Result<Box<Scorer>> {
Ok(Box::new(EmptyScorer))
}
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
Err(does_not_match(doc))
}
}
/// `EmptyScorer` is a dummy `Scorer` in which no document matches.

src/query/explanation.rs (new file, 51 lines)
View File

@@ -0,0 +1,51 @@
use {DocId, TantivyError};
pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))
}
/// Object describing the score of a given document.
/// It is organized in trees.
///
/// `.to_pretty_json()` can be useful to print out a human readable
/// representation of this tree when debugging a given score.
#[derive(Clone, Serialize)]
pub struct Explanation {
value: f32,
description: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
details: Vec<Explanation>,
}
impl Explanation {
/// Creates a new explanation object.
pub fn new<T: ToString>(description: T, value: f32) -> Explanation {
Explanation {
value,
description: description.to_string(),
details: vec![],
}
}
/// Returns the value associated to the current node.
pub fn value(&self) -> f32 {
self.value
}
/// Add some detail, explaining some part of the current node formula.
///
/// Details are treated as child of the current node.
pub fn add_detail(&mut self, child_explanation: Explanation) {
self.details.push(child_explanation);
}
/// Shortcut for `self.details.push(Explanation::new(name, value));`
pub fn add_const<T: ToString>(&mut self, name: T, value: f32) {
self.details.push(Explanation::new(name, value));
}
/// Returns an indented json representation of the explanation tree for debug usage.
pub fn to_pretty_json(&self) -> String {
serde_json::to_string_pretty(self).unwrap()
}
}
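
As an editorial aside, a minimal sketch of assembling and inspecting such a tree by hand; every method used below is defined in the new file above, and the values mirror the explain output in the query tests:

use tantivy::query::Explanation;

fn main() {
    // Root node: the overall value plus a human-readable description.
    let mut root = Explanation::new("TermQuery, product of...", 6.55f32);
    root.add_const("(K1+1)", 2.2f32);
    // A child node with its own details, attached via add_detail.
    let mut idf = Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", 5.66f32);
    idf.add_const("n, number of docs containing this term", 3f32);
    idf.add_const("N, total number of docs", 1003f32);
    root.add_detail(idf);
    assert_eq!(root.value(), 6.55f32);
    // Indented JSON rendering of the whole tree, handy when debugging a score.
    println!("{}", root.to_pretty_json());
}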

View File

@@ -9,6 +9,7 @@ mod bm25;
mod boolean_query;
mod empty_query;
mod exclude;
mod explanation;
mod fuzzy_query;
mod intersection;
mod occur;
@@ -39,6 +40,7 @@ pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::occur::Occur;

View File

@@ -93,21 +93,12 @@ impl Query for PhraseQuery {
field_name
)));
}
if scoring_enabled {
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
Ok(Box::new(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,
true,
)))
} else {
Ok(Box::new(PhraseWeight::new(
self.phrase_terms.clone(),
BM25Weight::null(),
false,
)))
}
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
let phrase_weight: PhraseWeight =
PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
Ok(Box::new(phrase_weight))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {

View File

@@ -148,9 +148,13 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
}
pub fn phrase_count(&self) -> u32 {
self.phrase_count
}
fn phrase_match(&mut self) -> bool {
if self.score_needed {
let count = self.phrase_count();
let count = self.compute_phrase_count();
self.phrase_count = count;
count > 0u32
} else {
@@ -183,7 +187,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn phrase_count(&mut self) -> u32 {
fn compute_phrase_count(&mut self) -> u32 {
{
self.intersection_docset
.docset_mut_specialized(0)

View File

@@ -1,12 +1,16 @@
use super::PhraseScorer;
use core::SegmentReader;
use fieldnorm::FieldNormReader;
use postings::SegmentPostings;
use query::bm25::BM25Weight;
use query::EmptyScorer;
use query::explanation::does_not_match;
use query::Scorer;
use query::Weight;
use query::{EmptyScorer, Explanation};
use schema::IndexRecordOption;
use schema::Term;
use Result;
use {DocId, DocSet};
use {Result, SkipResult};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -27,13 +31,18 @@ impl PhraseWeight {
score_needed,
}
}
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let similarity_weight = self.similarity_weight.clone();
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
let field = self.phrase_terms[0].1.field();
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
reader.get_fieldnorms_reader(field)
}
fn phrase_scorer(
&self,
reader: &SegmentReader,
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.clone();
let fieldnorm_reader = self.fieldnorm_reader(reader);
if reader.has_deletes() {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
@@ -43,10 +52,10 @@ impl Weight for PhraseWeight {
{
term_postings_list.push((offset, postings));
} else {
return Ok(Box::new(EmptyScorer));
return Ok(None);
}
}
Ok(Box::new(PhraseScorer::new(
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
@@ -61,10 +70,10 @@ impl Weight for PhraseWeight {
{
term_postings_list.push((offset, postings));
} else {
return Ok(Box::new(EmptyScorer));
return Ok(None);
}
}
Ok(Box::new(PhraseScorer::new(
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
@@ -73,3 +82,30 @@ impl Weight for PhraseWeight {
}
}
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader)? {
Ok(Box::new(scorer))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
}
let mut scorer = scorer_opt.unwrap();
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
explanation.add_detail(self.similarity_weight.explain(fieldnorm_id, phrase_count));
Ok(explanation)
}
}

View File

@@ -1,10 +1,11 @@
use super::Weight;
use core::searcher::Searcher;
use downcast_rs;
use query::Explanation;
use std::collections::BTreeSet;
use std::fmt;
use Result;
use Term;
use {downcast_rs, DocAddress};
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -48,6 +49,13 @@ pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
/// See [`Weight`](./trait.Weight.html).
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>>;
/// Returns an `Explanation` for the score of the document.
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> Result<Explanation> {
let reader = searcher.segment_reader(doc_address.segment_ord());
let weight = self.weight(searcher, true)?;
weight.explain(reader, doc_address.doc())
}
/// Returns the number of documents matching the query.
fn count(&self, searcher: &Searcher) -> Result<usize> {
let weight = self.weight(searcher, false)?;
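
As an editorial aside, a minimal end-to-end sketch of the new `explain` entry point; it only uses APIs that appear elsewhere in this diff, and the documents and query string are made up:

#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("sea")?;
    for (_score, doc_address) in searcher.search(&query, &TopDocs::with_limit(1))? {
        // New in this change: break the BM25 score of a matching document down.
        let explanation = query.explain(&searcher, doc_address)?;
        println!("{}", explanation.to_pretty_json());
    }
    Ok(())
}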

View File

@@ -2,15 +2,17 @@ use common::BitSet;
use core::Searcher;
use core::SegmentReader;
use error::TantivyError;
use query::BitSetDocSet;
use query::explanation::does_not_match;
use query::ConstScorer;
use query::{BitSetDocSet, Explanation};
use query::{Query, Scorer, Weight};
use schema::Type;
use schema::{Field, IndexRecordOption, Term};
use std::collections::Bound;
use std::ops::Range;
use termdict::{TermDictionary, TermStreamer};
use Result;
use DocId;
use {Result, SkipResult};
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>,
@@ -286,6 +288,14 @@ impl Weight for RangeWeight {
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
Ok(Explanation::new("RangeQuery", 1.0f32))
}
}
#[cfg(test)]

View File

@@ -1,5 +1,5 @@
use docset::{DocSet, SkipResult};
use query::Scorer;
use query::{Explanation, Scorer};
use DocId;
use Score;
@@ -28,11 +28,31 @@ impl TermScorer {
}
}
impl TermScorer {
pub fn term_freq(&self) -> u32 {
self.postings.term_freq()
}
pub fn fieldnorm_id(&self) -> u8 {
self.fieldnorm_reader.fieldnorm_id(self.doc())
}
pub fn explain(&self) -> Explanation {
let fieldnorm_id = self.fieldnorm_id();
let term_freq = self.term_freq();
self.similarity_weight.explain(fieldnorm_id, term_freq)
}
}
impl DocSet for TermScorer {
fn advance(&mut self) -> bool {
self.postings.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
}
fn doc(&self) -> DocId {
self.postings.doc()
}
@@ -40,17 +60,12 @@ impl DocSet for TermScorer {
fn size_hint(&self) -> u32 {
self.postings.size_hint()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
}
}
impl Scorer for TermScorer {
fn score(&mut self) -> Score {
let doc = self.doc();
let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
self.similarity_weight
.score(fieldnorm_id, self.postings.term_freq())
let fieldnorm_id = self.fieldnorm_id();
let term_freq = self.term_freq();
self.similarity_weight.score(fieldnorm_id, term_freq)
}
}

View File

@@ -3,11 +3,13 @@ use core::SegmentReader;
use docset::DocSet;
use postings::SegmentPostings;
use query::bm25::BM25Weight;
use query::Scorer;
use query::explanation::does_not_match;
use query::Weight;
use query::{Explanation, Scorer};
use schema::IndexRecordOption;
use Result;
use DocId;
use Term;
use {Result, SkipResult};
pub struct TermWeight {
term: Term,
@@ -17,25 +19,16 @@ pub struct TermWeight {
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let similarity_weight = self.similarity_weight.clone();
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);
if let Some(segment_postings) = postings_opt {
Ok(Box::new(TermScorer::new(
segment_postings,
fieldnorm_reader,
similarity_weight,
)))
} else {
Ok(Box::new(TermScorer::new(
SegmentPostings::empty(),
fieldnorm_reader,
similarity_weight,
)))
let term_scorer = self.scorer_specialized(reader)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader)?;
if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc));
}
Ok(scorer.explain())
}
fn count(&self, reader: &SegmentReader) -> Result<u32> {
@@ -64,4 +57,26 @@ impl TermWeight {
similarity_weight,
}
}
fn scorer_specialized(&self, reader: &SegmentReader) -> Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let similarity_weight = self.similarity_weight.clone();
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);
if let Some(segment_postings) = postings_opt {
Ok(TermScorer::new(
segment_postings,
fieldnorm_reader,
similarity_weight,
))
} else {
Ok(TermScorer::new(
SegmentPostings::empty(),
fieldnorm_reader,
similarity_weight,
))
}
}
}

View File

@@ -1,6 +1,7 @@
use super::Scorer;
use core::SegmentReader;
use Result;
use query::Explanation;
use {DocId, Result};
/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -11,6 +12,9 @@ pub trait Weight: Send + Sync + 'static {
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
/// Returns an `Explanation` for the given document.
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation>;
/// Returns the number of documents within the given `SegmentReader`.
fn count(&self, reader: &SegmentReader) -> Result<u32> {
let mut scorer = self.scorer(reader)?;

View File

@@ -16,7 +16,7 @@ const BLOCK_SIZE: usize = 16_384;
the store is written to disc as documents are being added,
/// as opposed to when the segment is getting finalized.
///
/// The skip list index on the other hand, is build in memory.
/// The skip list index on the other hand, is built in memory.
///
pub struct StoreWriter {
doc: DocId,