Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
dce6adc4b6 Revert "add index accessor for index writer (#1159)"
This reverts commit b256df6599.
2021-09-23 21:49:34 +09:00
150 changed files with 3087 additions and 5881 deletions

View File

@@ -6,10 +6,3 @@ updates:
interval: daily interval: daily
time: "20:00" time: "20:00"
open-pull-requests-limit: 10 open-pull-requests-limit: 10
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: daily
time: "20:00"
open-pull-requests-limit: 10

View File

@@ -18,7 +18,7 @@ jobs:
- name: Generate code coverage - name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
uses: codecov/codecov-action@v2 uses: codecov/codecov-action@v1
with: with:
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info files: lcov.info

View File

@@ -21,10 +21,10 @@ jobs:
- name: Install latest nightly to test also against unstable feature flag - name: Install latest nightly to test also against unstable feature flag
uses: actions-rs/toolchain@v1 uses: actions-rs/toolchain@v1
with: with:
toolchain: stable toolchain: nightly
override: true override: true
components: rustfmt components: rustfmt
- name: Run tests - name: Run tests
run: cargo test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace run: cargo test --all-features --verbose --workspace
- name: Check Formatting - name: Check Formatting
run: cargo fmt --all -- --check run: cargo fmt --all -- --check

1
.gitignore vendored
View File

@@ -1,5 +1,4 @@
tantivy.iml tantivy.iml
.cargo
proptest-regressions proptest-regressions
*.swp *.swp
target target

92
.travis.yml Normal file
View File

@@ -0,0 +1,92 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2
dist: trusty
language: rust
services: docker
sudo: required
env:
global:
- CRATE_NAME=tantivy
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
#- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
# Linux
#- env: TARGET=aarch64-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
# OSX
#- env: TARGET=x86_64-apple-darwin
# os: osx
before_install:
- set -e
- rustup self update
- rustup component add rustfmt
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- env | grep "TRAVIS"
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
script:
- bash ci/script.sh
- cargo fmt --all -- --check
before_deploy:
- sh ci/before_deploy.sh
after_success:
# Needs GH_TOKEN env var to be set in travis settings
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
#cache: cargo
#before_cache:
# # Travis can't cache files that are not readable by "others"
# - chmod -R a+r $HOME/.cargo
# - find ./target/debug -type f -maxdepth 1 -delete
# - rm -f ./target/.rustc_info.json
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
# - rm -r target/debug/examples/
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master
notifications:
email:
on_success: never

View File

@@ -1,21 +1,6 @@
Tantivy 0.17
================================
- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
- Adds a searcher Warmer API (@shikhar)
- Change to non-strict schema. Ignore fields in data which are not defined in schema. Previously this returned an error. #1211
- Facets are necessarily indexed. Existing index with indexed facets should work out of the box. Index without facets that are marked with index: false should be broken (but they were already broken in a sense). (@fulmicoton) #1195 .
- Bugfix that could in theory impact durability in theory on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)
Tantivy 0.16.2
================================
- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything)
Tantivy 0.16.1 Tantivy 0.16.1
======================== ========================
- Major Bugfix on multivalued fastfield. #1151 - Major Bugfix on multivalued fastfield. #1151
- Demux operation (@PSeitz)
Tantivy 0.16.0 Tantivy 0.16.0
========================= =========================
@@ -128,7 +113,7 @@ Tantivy 0.12.0
## How to update? ## How to update?
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
to check for some code sample. to check for some code sample.
Tantivy 0.11.3 Tantivy 0.11.3

View File

@@ -1,13 +1,13 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.17.0-dev" version = "0.16.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
description = """Search engine library""" description = """Search engine library"""
documentation = "https://docs.rs/tantivy/" documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/quickwit-inc/tantivy" homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/quickwit-inc/tantivy" repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md" readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
edition = "2018" edition = "2018"
@@ -20,7 +20,7 @@ once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] } regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3" tantivy-fst = "0.3"
memmap2 = {version = "0.5", optional=true} memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true } lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true } brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true } snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.2", optional = true } tempfile = { version = "3.2", optional = true }
@@ -37,7 +37,7 @@ tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" } tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version = "0.1", path = "./common/", package = "tantivy-common" } common = { version = "0.1", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false } fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.2", path="./ownedbytes" } ownedbytes = { version="0.1", path="./ownedbytes" }
stable_deref_trait = "1.2" stable_deref_trait = "1.2"
rust-stemmers = "1.2" rust-stemmers = "1.2"
downcast-rs = "1.2" downcast-rs = "1.2"
@@ -46,15 +46,15 @@ census = "0.4"
fnv = "1.0.7" fnv = "1.0.7"
thiserror = "1.0.24" thiserror = "1.0.24"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = "0.5" fail = "0.4"
murmurhash32 = "0.2" murmurhash32 = "0.2"
chrono = "0.4.19" chrono = "0.4.19"
smallvec = "1.6.1" smallvec = "1.6.1"
rayon = "1.5" rayon = "1.5"
lru = "0.7.0" lru = "0.6.5"
fastdivide = "0.3" fastdivide = "0.3"
itertools = "0.10.0" itertools = "0.10.0"
measure_time = "0.8.0" measure_time = "0.7.0"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"
@@ -65,11 +65,11 @@ maplit = "1.0.2"
matches = "0.1.8" matches = "0.1.8"
proptest = "1.0" proptest = "1.0"
criterion = "0.3.5" criterion = "0.3.5"
test-log = "0.2.8" test-env-log = "0.2.7"
env_logger = "0.9.0" env_logger = "0.9.0"
[dev-dependencies.fail] [dev-dependencies.fail]
version = "0.5" version = "0.4"
features = ["failpoints"] features = ["failpoints"]
[profile.release] [profile.release]
@@ -91,6 +91,7 @@ snappy-compression = ["snap"]
failpoints = ["fail/failpoints"] failpoints = ["fail/failpoints"]
unstable = [] # useful for benches. unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace] [workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"] members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]

View File

@@ -1,8 +1,8 @@
[![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/) [![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/)
[![Build Status](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml) [![Build Status](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml)
[![codecov](https://codecov.io/gh/quickwit-inc/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/quickwit-inc/tantivy) [![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Join the chat at https://discord.gg/MT27AG5EVE](https://shields.io/discord/908281611840282624?label=chat%20on%20discord)](https://discord.gg/MT27AG5EVE) [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy) [![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
@@ -17,6 +17,9 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6) [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7) [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in Rust. **Tantivy** is a **full text search engine library** written in Rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
@@ -75,12 +78,13 @@ It walks you through getting a wikipedia search engine up and running in a few m
There are many ways to support this project. There are many ways to support this project.
- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com) - Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs - Report bugs
- Write a blog post - Write a blog post
- Help with documentation by asking questions or submitting PRs - Help with documentation by asking questions or submitting PRs
- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE)) - Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
- Talk about Tantivy around you - Talk about Tantivy around you
- [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
# Contributing code # Contributing code
@@ -92,7 +96,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run: To check out and run tests, you can simply run:
```bash ```bash
git clone https://github.com/quickwit-inc/tantivy.git git clone https://github.com/tantivy-search/tantivy.git
cd tantivy cd tantivy
cargo build cargo build
``` ```

View File

@@ -6,7 +6,7 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = [] categories = []
description = """Tantivy-sub crate: bitpacking""" description = """Tantivy-sub crate: bitpacking"""
repository = "https://github.com/quickwit-inc/tantivy" repository = "https://github.com/tantivy-search/tantivy"
keywords = [] keywords = []

View File

@@ -10,7 +10,6 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies] [dependencies]
byteorder = "1.4.3" byteorder = "1.4.3"
ownedbytes = { version="0.2", path="../ownedbytes" }
[dev-dependencies] [dev-dependencies]
proptest = "1.0.0" proptest = "1.0.0"

View File

@@ -1,8 +1,5 @@
use ownedbytes::OwnedBytes; use std::fmt;
use std::convert::TryInto;
use std::io::Write;
use std::u64; use std::u64;
use std::{fmt, io};
#[derive(Clone, Copy, Eq, PartialEq)] #[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64); pub struct TinySet(u64);
@@ -17,7 +14,6 @@ pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator { impl Iterator for TinySetIterator {
type Item = u32; type Item = u32;
#[inline]
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest() self.0.pop_lowest()
} }
@@ -32,60 +28,31 @@ impl IntoIterator for TinySet {
} }
impl TinySet { impl TinySet {
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.0.to_le_bytes().as_ref())
}
pub fn into_bytes(self) -> [u8; 8] {
self.0.to_le_bytes()
}
#[inline]
pub fn deserialize(data: [u8; 8]) -> Self {
let val: u64 = u64::from_le_bytes(data);
TinySet(val)
}
/// Returns an empty `TinySet`. /// Returns an empty `TinySet`.
#[inline]
pub fn empty() -> TinySet { pub fn empty() -> TinySet {
TinySet(0u64) TinySet(0u64)
} }
/// Returns a full `TinySet`.
#[inline]
pub fn full() -> TinySet {
TinySet::empty().complement()
}
pub fn clear(&mut self) { pub fn clear(&mut self) {
self.0 = 0u64; self.0 = 0u64;
} }
/// Returns the complement of the set in `[0, 64[`. /// Returns the complement of the set in `[0, 64[`.
///
/// Careful on making this function public, as it will break the padding handling in the last
/// bucket.
#[inline]
fn complement(self) -> TinySet { fn complement(self) -> TinySet {
TinySet(!self.0) TinySet(!self.0)
} }
/// Returns true iff the `TinySet` contains the element `el`. /// Returns true iff the `TinySet` contains the element `el`.
#[inline]
pub fn contains(self, el: u32) -> bool { pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty() !self.intersect(TinySet::singleton(el)).is_empty()
} }
/// Returns the number of elements in the TinySet. /// Returns the number of elements in the TinySet.
#[inline]
pub fn len(self) -> u32 { pub fn len(self) -> u32 {
self.0.count_ones() self.0.count_ones()
} }
/// Returns the intersection of `self` and `other` /// Returns the intersection of `self` and `other`
#[inline]
#[must_use]
pub fn intersect(self, other: TinySet) -> TinySet { pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0) TinySet(self.0 & other.0)
} }
@@ -97,23 +64,13 @@ impl TinySet {
TinySet(1u64 << u64::from(el)) TinySet(1u64 << u64::from(el))
} }
/// Insert a new element within [0..64) /// Insert a new element within [0..64[
#[inline] #[inline]
#[must_use]
pub fn insert(self, el: u32) -> TinySet { pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el)) self.union(TinySet::singleton(el))
} }
/// Removes an element within [0..64) /// Insert a new element within [0..64[
#[inline]
#[must_use]
pub fn remove(self, el: u32) -> TinySet {
self.intersect(TinySet::singleton(el).complement())
}
/// Insert a new element within [0..64)
///
/// returns true if the set changed
#[inline] #[inline]
pub fn insert_mut(&mut self, el: u32) -> bool { pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self; let old = *self;
@@ -121,19 +78,8 @@ impl TinySet {
old != *self old != *self
} }
/// Remove a element within [0..64)
///
/// returns true if the set changed
#[inline]
pub fn remove_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.remove(el);
old != *self
}
/// Returns the union of two tinysets /// Returns the union of two tinysets
#[inline] #[inline]
#[must_use]
pub fn union(self, other: TinySet) -> TinySet { pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0) TinySet(self.0 | other.0)
} }
@@ -177,7 +123,7 @@ impl TinySet {
#[derive(Clone)] #[derive(Clone)]
pub struct BitSet { pub struct BitSet {
tinysets: Box<[TinySet]>, tinysets: Box<[TinySet]>,
len: u64, len: usize,
max_value: u32, max_value: u32,
} }
@@ -186,47 +132,18 @@ fn num_buckets(max_val: u32) -> u32 {
} }
impl BitSet { impl BitSet {
/// serialize a `BitSet`.
///
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter().cloned() {
writer.write_all(&tinyset.into_bytes())?;
}
writer.flush()?;
Ok(())
}
/// Create a new `BitSet` that may contain elements /// Create a new `BitSet` that may contain elements
/// within `[0, max_val)`. /// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet { pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value); let num_buckets = num_buckets(max_value);
let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice(); let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet { BitSet {
tinysets: tinybitsets, tinysets: tinybisets,
len: 0, len: 0,
max_value, max_value,
} }
} }
/// Create a new `BitSet` that may contain elements. Initially all values will be set.
/// within `[0, max_val)`.
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
// Fix padding
let lower = max_value % 64u32;
if lower != 0 {
tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
}
BitSet {
tinysets: tinybitsets,
len: max_value as u64,
max_value,
}
}
/// Removes all elements from the `BitSet`. /// Removes all elements from the `BitSet`.
pub fn clear(&mut self) { pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() { for tinyset in self.tinysets.iter_mut() {
@@ -234,28 +151,12 @@ impl BitSet {
} }
} }
/// Intersect with serialized bitset
pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
self.intersect_update_with_iter(other.iter_tinysets());
}
/// Intersect with tinysets
fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
self.len = 0;
for (left, right) in self.tinysets.iter_mut().zip(other) {
*left = left.intersect(right);
self.len += left.len() as u64;
}
}
/// Returns the number of elements in the `BitSet`. /// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.len as usize self.len
} }
/// Inserts an element in the `BitSet` /// Inserts an element in the `BitSet`
#[inline]
pub fn insert(&mut self, el: u32) { pub fn insert(&mut self, el: u32) {
// we do not check saturated els. // we do not check saturated els.
let higher = el / 64u32; let higher = el / 64u32;
@@ -267,21 +168,7 @@ impl BitSet {
}; };
} }
/// Inserts an element in the `BitSet`
#[inline]
pub fn remove(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the elements is in the `BitSet`. /// Returns true iff the elements is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool { pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64) self.tinyset(el / 64u32).contains(el % 64)
} }
@@ -299,7 +186,6 @@ impl BitSet {
.map(|delta_bucket| bucket + delta_bucket as u32) .map(|delta_bucket| bucket + delta_bucket as u32)
} }
#[inline]
pub fn max_value(&self) -> u32 { pub fn max_value(&self) -> u32 {
self.max_value self.max_value
} }
@@ -312,237 +198,16 @@ impl BitSet {
} }
} }
/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadOnlyBitSet {
data: OwnedBytes,
max_value: u32,
}
pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
assert_eq!(left.max_value(), other.max_value());
assert_eq!(left.data.len(), other.data.len());
let union_tinyset_it = left
.iter_tinysets()
.zip(other.iter_tinysets())
.map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
for tinyset in union_tinyset_it {
output_dataset.extend_from_slice(&tinyset.into_bytes());
}
ReadOnlyBitSet {
data: OwnedBytes::new(output_dataset),
max_value: left.max_value(),
}
}
impl ReadOnlyBitSet {
pub fn open(data: OwnedBytes) -> Self {
let (max_value_data, data) = data.split(4);
assert_eq!(data.len() % 8, 0);
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
ReadOnlyBitSet { data, max_value }
}
/// Number of elements in the bitset.
#[inline]
pub fn len(&self) -> usize {
self.iter_tinysets()
.map(|tinyset| tinyset.len() as usize)
.sum()
}
/// Iterate the tinyset on the fly from serialized data.
///
#[inline]
fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
self.data.chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
tinyset
})
}
/// Iterate over the positions of the elements.
///
#[inline]
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
self.iter_tinysets()
.enumerate()
.flat_map(move |(chunk_num, tinyset)| {
let chunk_base_val = chunk_num as u32 * 64;
tinyset
.into_iter()
.map(move |val| val + chunk_base_val)
.take_while(move |doc| *doc < self.max_value)
})
}
/// Returns true iff the elements is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool {
let byte_offset = el / 8u32;
let b: u8 = self.data[byte_offset as usize];
let shift = (el % 8) as u8;
b & (1u8 << shift) != 0
}
/// Maximum value the bitset may contain.
/// (Note this is not the maximum value contained in the set.)
///
/// A bitset has an intrinsic capacity.
/// It only stores elements within [0..max_value).
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Number of bytes used in the bitset representation.
pub fn num_bytes(&self) -> usize {
self.data.len()
}
}
impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
bitset
.serialize(&mut buffer)
.expect("serializing into a buffer should never fail");
ReadOnlyBitSet::open(OwnedBytes::new(buffer))
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::BitSet; use super::BitSet;
use super::ReadOnlyBitSet;
use super::TinySet; use super::TinySet;
use ownedbytes::OwnedBytes;
use rand::distributions::Bernoulli; use rand::distributions::Bernoulli;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
use std::collections::HashSet; use std::collections::HashSet;
#[test]
fn test_read_serialized_bitset_full_multi() {
for i in 0..1000 {
let bitset = BitSet::with_max_value_and_full(i);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, i as usize);
}
}
#[test]
fn test_read_serialized_bitset_full_block() {
let bitset = BitSet::with_max_value_and_full(64);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, 64 as usize);
}
#[test]
fn test_read_serialized_bitset_full() {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 4);
}
#[test]
fn test_bitset_intersect() {
let bitset_serialized = {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
ReadOnlyBitSet::open(OwnedBytes::new(out))
};
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.intersect_update(&bitset_serialized);
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(bitset.contains(2));
assert!(!bitset.contains(3));
assert!(bitset.contains(4));
bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 1);
bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
assert!(!bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 0);
}
#[test]
fn test_read_serialized_bitset_empty() {
let mut bitset = BitSet::with_max_value(5);
bitset.insert(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 1);
{
let bitset = BitSet::with_max_value(5);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 0);
}
}
#[test]
fn test_tiny_set_remove() {
{
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty()
.insert(63u32)
.insert(1)
.insert(5)
.remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(1u32);
assert!(u.pop_lowest().is_none());
}
}
#[test] #[test]
fn test_tiny_set() { fn test_tiny_set() {
assert!(TinySet::empty().is_empty()); assert!(TinySet::empty().is_empty());
@@ -568,17 +233,6 @@ mod tests {
assert_eq!(u.pop_lowest(), Some(63u32)); assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none()); assert!(u.pop_lowest().is_none());
} }
{
let mut u = TinySet::empty().insert(63u32).insert(5);
assert_eq!(u.pop_lowest(), Some(5u32));
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
{
let original = TinySet::empty().insert(63u32).insert(5);
let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
assert_eq!(original, after_serialize_deserialize);
}
} }
#[test] #[test]
@@ -595,16 +249,6 @@ mod tests {
assert_eq!(hashset.contains(&el), bitset.contains(el)); assert_eq!(hashset.contains(&el), bitset.contains(el));
} }
assert_eq!(bitset.max_value(), max_value); assert_eq!(bitset.max_value(), max_value);
// test deser
let mut data = vec![];
bitset.serialize(&mut data).unwrap();
let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
for el in 0..max_value {
assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
}
assert_eq!(ro_bitset.max_value(), max_value);
assert_eq!(ro_bitset.len(), els.len());
}; };
test_against_hashset(&[], 0); test_against_hashset(&[], 0);
@@ -669,14 +313,6 @@ mod tests {
assert_eq!(bitset.len(), 2); assert_eq!(bitset.len(), 2);
bitset.insert(104u32); bitset.insert(104u32);
assert_eq!(bitset.len(), 3); assert_eq!(bitset.len(), 3);
bitset.remove(105u32);
assert_eq!(bitset.len(), 3);
bitset.remove(104u32);
assert_eq!(bitset.len(), 2);
bitset.remove(3u32);
assert_eq!(bitset.len(), 1);
bitset.remove(103u32);
assert_eq!(bitset.len(), 0);
} }
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> { pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {

View File

@@ -1,5 +1,3 @@
#![allow(clippy::len_without_is_empty)]
use std::ops::Deref; use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness; pub use byteorder::LittleEndian as Endianness;

View File

@@ -54,7 +54,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
} }
} }
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly /// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
/// ///
/// The point is that while the type is public, it cannot be built by anyone /// The point is that while the type is public, it cannot be built by anyone
/// outside of this module. /// outside of this module.

View File

@@ -38,7 +38,7 @@ Note: Tantivy 0.16 does not do this optimization yet.
In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?) In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
## Usage ## Usage
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used. The index sorting can be configured setting [`sort_by_field`](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used.
``` ```
let settings = IndexSettings { let settings = IndexSettings {
@@ -55,7 +55,7 @@ let index = index_builder.create_in_ram().unwrap();
## Implementation details ## Implementation details
Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073). Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets). In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).

View File

@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
); );
// ... and add it to the `IndexWriter`. // ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc)?; index_writer.add_document(old_man_doc);
// For convenience, tantivy also comes with a macro to // For convenience, tantivy also comes with a macro to
// reduce the boilerplate above. // reduce the boilerplate above.
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
// Multivalued field just need to be repeated. // Multivalued field just need to be repeated.
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
))?; ));
// This is an example, so we will only index 3 documents // This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index // here. You can check out tantivy's tutorial to index

View File

@@ -145,23 +145,23 @@ fn main() -> tantivy::Result<()> {
product_description => "While it is ok for short distance travel, this broom \ product_description => "While it is ok for short distance travel, this broom \
was designed quiditch. It will up your game.", was designed quiditch. It will up your game.",
price => 30_200u64 price => 30_200u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Turbulobroom", product_name => "Turbulobroom",
product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\ product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
You'll enjoy its sharp turns, and rapid acceleration", You'll enjoy its sharp turns, and rapid acceleration",
price => 29_240u64 price => 29_240u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Broomio", product_name => "Broomio",
product_description => "Great value for the price. This broom is a market favorite", product_description => "Great value for the price. This broom is a market favorite",
price => 21_240u64 price => 21_240u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Whack a Mole", product_name => "Whack a Mole",
product_description => "Prime quality bat.", product_description => "Prime quality bat.",
price => 5_200u64 price => 5_200u64
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the fresh and green with every spring, carrying in their lower leaf junctures the
debris of the winters flooding; and sycamores with mottled, white, recumbent debris of the winters flooding; and sycamores with mottled, white, recumbent
limbs and branches that arch over the pool"# limbs and branches that arch over the pool"#
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking."# increasing confidence in the success of my undertaking."#
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;

View File

@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0099908401", isbn => "978-0099908401",
title => "The old Man and the see" title => "The old Man and the see"
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0140177398", isbn => "978-0140177398",
title => "Of Mice and Men", title => "Of Mice and Men",
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankentein", //< Oops there is a typo here. title => "Frankentein", //< Oops there is a typo here.
isbn => "978-9176370711", isbn => "978-9176370711",
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
isbn => "978-9176370711", isbn => "978-9176370711",
))?; ));
// You are guaranteed that your clients will only observe your index in // You are guaranteed that your clients will only observe your index in
// the state it was in after a commit. // the state it was in after a commit.

View File

@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let name = schema_builder.add_text_field("felin_name", TEXT | STORED); let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
// this is our faceted field: its scientific classification // this is our faceted field: its scientific classification
let classification = schema_builder.add_facet_field("classification", FacetOptions::default()); let classification = schema_builder.add_facet_field("classification", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Cat", name => "Cat",
classification => Facet::from("/Felidae/Felinae/Felis") classification => Facet::from("/Felidae/Felinae/Felis")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Canada lynx", name => "Canada lynx",
classification => Facet::from("/Felidae/Felinae/Lynx") classification => Facet::from("/Felidae/Felinae/Lynx")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Cheetah", name => "Cheetah",
classification => Facet::from("/Felidae/Felinae/Acinonyx") classification => Facet::from("/Felidae/Felinae/Acinonyx")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Tiger", name => "Tiger",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Lion", name => "Lion",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Jaguar", name => "Jaguar",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Sunda clouded leopard", name => "Sunda clouded leopard",
classification => Facet::from("/Felidae/Pantherinae/Neofelis") classification => Facet::from("/Felidae/Pantherinae/Neofelis")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Fossa", name => "Fossa",
classification => Facet::from("/Eupleridae/Cryptoprocta") classification => Facet::from("/Eupleridae/Cryptoprocta")
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;

View File

@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", STORED); let title = schema_builder.add_text_field("title", STORED);
let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default()); let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
title => "Fried egg", title => "Fried egg",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/oil"), ingredient => Facet::from("/ingredient/oil"),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Scrambled egg", title => "Scrambled egg",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/butter"), ingredient => Facet::from("/ingredient/butter"),
ingredient => Facet::from("/ingredient/milk"), ingredient => Facet::from("/ingredient/milk"),
ingredient => Facet::from("/ingredient/salt"), ingredient => Facet::from("/ingredient/salt"),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Egg rolls", title => "Egg rolls",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
ingredient => Facet::from("/ingredient/oil"), ingredient => Facet::from("/ingredient/oil"),
ingredient => Facet::from("/ingredient/tortilla-wrap"), ingredient => Facet::from("/ingredient/tortilla-wrap"),
ingredient => Facet::from("/ingredient/mushroom"), ingredient => Facet::from("/ingredient/mushroom"),
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;

View File

@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED}; use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Result}; use tantivy::{doc, Index, Result};
fn main() -> Result<()> { fn run() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field // For the sake of simplicity, this schema will only have 1 field
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -19,7 +19,7 @@ fn main() -> Result<()> {
{ {
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 { for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?; index_writer.add_document(doc!(year_field => year));
} }
index_writer.commit()?; index_writer.commit()?;
// The index will be a range of years // The index will be a range of years
@@ -33,3 +33,7 @@ fn main() -> Result<()> {
assert_eq!(num_60s_books, 10); assert_eq!(num_60s_books, 10);
Ok(()) Ok(())
} }
fn main() {
run().unwrap()
}

View File

@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?; index_writer.add_document(doc!(title => "The Old Man and the Sea"));
index_writer.add_document(doc!(title => "Of Mice and Men"))?; index_writer.add_document(doc!(title => "Of Mice and Men"));
index_writer.add_document(doc!(title => "The modern Promotheus"))?; index_writer.add_document(doc!(title => "The modern Promotheus"));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;

View File

@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
use std::thread; use std::thread;
use std::time::Duration; use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT}; use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError}; use tantivy::{doc, Index, IndexWriter, Opstamp};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -59,11 +59,10 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
println!("add doc {} from thread 1 - opstamp {}", i, opstamp); println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20)); thread::sleep(Duration::from_millis(20));
} }
Result::<(), TantivyError>::Ok(())
}); });
// # Second indexing thread. // # Second indexing thread.
@@ -79,12 +78,11 @@ fn main() -> tantivy::Result<()> {
index_writer_rlock.add_document(doc!( index_writer_rlock.add_document(doc!(
title => "Manufacturing consent", title => "Manufacturing consent",
body => "Some great book description..." body => "Some great book description..."
))? ))
}; };
println!("add doc {} from thread 2 - opstamp {}", i, opstamp); println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10)); thread::sleep(Duration::from_millis(10));
} }
Result::<(), TantivyError>::Ok(())
}); });
// # In the main thread, we commit 10 times, once every 500ms. // # In the main thread, we commit 10 times, once every 500ms.
@@ -92,7 +90,7 @@ fn main() -> tantivy::Result<()> {
let opstamp: Opstamp = { let opstamp: Opstamp = {
// Committing or rollbacking on the other hand requires write lock. This will block other threads. // Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap(); let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()? index_writer_wlock.commit().unwrap()
}; };
println!("committed with opstamp {}", opstamp); println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500)); thread::sleep(Duration::from_millis(500));

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
let old_man_doc = doc!(title => title_tok, body => body_tok); let old_man_doc = doc!(title => title_tok, body => body_tok);
// ... now let's just add it to the IndexWriter // ... now let's just add it to the IndexWriter
index_writer.add_document(old_man_doc)?; index_writer.add_document(old_man_doc);
// Pretokenized text can also be fed as JSON // Pretokenized text can also be fed as JSON
let short_man_json = r#"{ let short_man_json = r#"{
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
let short_man_doc = schema.parse_document(short_man_json)?; let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?; index_writer.add_document(short_man_doc);
// Let's commit changes // Let's commit changes
index_writer.commit()?; index_writer.commit()?;
@@ -106,7 +106,9 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?; let (top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
assert_eq!(count, 2); assert_eq!(count, 2);
@@ -127,7 +129,9 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?; let (_top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
assert_eq!(count, 0); assert_eq!(count, 0);

View File

@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
// ... // ...
index_writer.commit()?; index_writer.commit()?;
@@ -70,13 +70,13 @@ fn highlight(snippet: Snippet) -> String {
let mut start_from = 0; let mut start_from = 0;
for fragment_range in snippet.highlighted() { for fragment_range in snippet.highlighted() {
result.push_str(&snippet.fragment()[start_from..fragment_range.start]); result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
result.push_str(" --> "); result.push_str(" --> ");
result.push_str(&snippet.fragment()[fragment_range.clone()]); result.push_str(&snippet.fragments()[fragment_range.clone()]);
result.push_str(" <-- "); result.push_str(" <-- ");
start_from = fragment_range.end; start_from = fragment_range.end;
} }
result.push_str(&snippet.fragment()[start_from..]); result.push_str(&snippet.fragments()[start_from..]);
result result
} }

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
))?; ));
index_writer.commit()?; index_writer.commit()?;

View File

@@ -1,223 +0,0 @@
use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};
use tantivy::collector::TopDocs;
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT};
use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader, TrackedObject};
use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};
// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
type ProductId = u64;
/// Price
type Price = u32;
pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
struct DynamicPriceColumn {
field: Field,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
impl DynamicPriceColumn {
pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
DynamicPriceColumn {
field,
price_cache: Default::default(),
price_fetcher: Box::new(price_fetcher),
}
}
pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
self.price_cache.read().unwrap().get(&segment_key).cloned()
}
}
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(price_vals));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
.collect();
}
}
/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// This map represents a map (ProductId -> Price)
///
/// In practise, it could be fetching things from an external service, like a SQL table.
///
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
}
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices_read = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
fn main() -> tantivy::Result<()> {
// Declaring our schema.
let mut schema_builder = Schema::builder();
// The product id is assumed to be a primary id for our external price source.
let product_id = schema_builder.add_u64_field("product_id", FAST);
let text = schema_builder.add_text_field("text", TEXT);
let schema: Schema = schema_builder.build();
let price_table = ExternalPriceTable::default();
let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
product_id,
price_table.clone(),
));
price_table.update_price(OLIVE_OIL, 12);
price_table.update_price(GLOVES, 13);
price_table.update_price(SNEAKERS, 80);
const OLIVE_OIL: ProductId = 323423;
const GLOVES: ProductId = 3966623;
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index
.reader_builder()
.warmers(warmers)
.num_searchers(1)
.try_into()?;
reader.reload()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;
let searcher = reader.searcher();
let score_by_price = move |segment_reader: &SegmentReader| {
let price = price_dynamic_column
.price_for_segment(segment_reader)
.unwrap();
move |doc_id: DocId| Reverse(price[doc_id as usize])
};
let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
let hits = searcher.search(&query, &most_expensive_first)?;
assert_eq!(
&hits,
&[
(
Reverse(12u32),
DocAddress {
segment_ord: 0,
doc_id: 0u32
}
),
(
Reverse(13u32),
DocAddress {
segment_ord: 0,
doc_id: 1u32
}
),
]
);
// Olive oil just got more expensive!
price_table.update_price(OLIVE_OIL, 15);
// The price update are directly reflected on `reload`.
//
// Be careful here though!...
// You may have spotted that we are still using the same `Searcher`.
//
// It is up to the `Warmer` implementer to decide how
// to control this behavior.
reader.reload()?;
let hits_with_new_prices = searcher.search(&query, &most_expensive_first)?;
assert_eq!(
&hits_with_new_prices,
&[
(
Reverse(13u32),
DocAddress {
segment_ord: 0,
doc_id: 1u32
}
),
(
Reverse(15u32),
DocAddress {
segment_ord: 0,
doc_id: 0u32
}
),
]
);
Ok(())
}

View File

@@ -118,7 +118,7 @@ mod tests {
); );
} }
} }
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let actual_compression = data.len() as f32 / out.len() as f32;
(estimation, actual_compression) (estimation, actual_compression)
} }
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> { pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

View File

@@ -239,21 +239,11 @@ mod tests {
use super::*; use super::*;
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::< crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer, LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader, LinearInterpolFastFieldReader,
>(data, name) >(data, name);
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.01);
assert!(estimate < 0.01);
} }
#[test] #[test]

View File

@@ -57,7 +57,7 @@ struct Function {
impl Function { impl Function {
fn calc_slope(&mut self) { fn calc_slope(&mut self) {
let num_vals = self.end_pos - self.start_pos; let num_vals = self.end_pos - self.start_pos;
self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals); get_slope(self.value_start_pos, self.value_end_pos, num_vals);
} }
// split the interpolation into two function, change self and return the second split // split the interpolation into two function, change self and return the second split
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function { fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
@@ -378,22 +378,11 @@ mod tests {
use super::*; use super::*;
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::< crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldReader,
>(data, name) >(data, name);
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.01);
} }
#[test] #[test]
@@ -425,11 +414,9 @@ mod tests {
fn rand() { fn rand() {
for _ in 0..10 { for _ in 0..10 {
let mut data = (5_000..20_000) let mut data = (5_000..20_000)
.map(|_| rand::random::<u32>() as u64) .map(|_| rand::random::<u64>() as u64)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let (estimate, actual_compression) = create_and_validate(&data, "random"); create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse(); data.reverse();
create_and_validate(&data, "random"); create_and_validate(&data, "random");

View File

@@ -1,7 +1,7 @@
[package] [package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"] authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes" name = "ownedbytes"
version = "0.2.0" version = "0.1.0"
edition = "2018" edition = "2018"
description = "Expose data as static slice" description = "Expose data as static slice"
license = "MIT" license = "MIT"

View File

@@ -1,5 +1,3 @@
#![allow(clippy::return_self_not_must_use)]
use stable_deref_trait::StableDeref; use stable_deref_trait::StableDeref;
use std::convert::TryInto; use std::convert::TryInto;
use std::mem; use std::mem;
@@ -37,8 +35,6 @@ impl OwnedBytes {
} }
/// creates a fileslice that is just a view over a slice of the data. /// creates a fileslice that is just a view over a slice of the data.
#[must_use]
#[inline]
pub fn slice(&self, range: Range<usize>) -> Self { pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes { OwnedBytes {
data: &self.data[range], data: &self.data[range],
@@ -67,8 +63,6 @@ impl OwnedBytes {
/// On the other hand, both `left` and `right` retain a handle over /// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only /// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped. /// be released when both left and right are dropped.
#[inline]
#[must_use]
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) { pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone(); let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes { let left = OwnedBytes {
@@ -82,19 +76,6 @@ impl OwnedBytes {
(left, right) (left, right)
} }
/// Splits the right part of the `OwnedBytes` at the given offset.
///
/// `self` is truncated to `split_len`, left with the remaining bytes.
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
let right_box_stable_deref = self.box_stable_deref.clone();
let right_piece = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
self.data = &self.data[..split_len];
right_piece
}
/// Returns true iff this `OwnedBytes` is empty. /// Returns true iff this `OwnedBytes` is empty.
#[inline] #[inline]
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
@@ -103,6 +84,7 @@ impl OwnedBytes {
/// Drops the left most `advance_len` bytes. /// Drops the left most `advance_len` bytes.
/// ///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline] #[inline]
pub fn advance(&mut self, advance_len: usize) { pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..] self.data = &self.data[advance_len..]
@@ -142,35 +124,6 @@ impl fmt::Debug for OwnedBytes {
} }
} }
impl PartialEq for OwnedBytes {
fn eq(&self, other: &OwnedBytes) -> bool {
self.as_slice() == other.as_slice()
}
}
impl Eq for OwnedBytes {}
impl PartialEq<[u8]> for OwnedBytes {
fn eq(&self, other: &[u8]) -> bool {
self.as_slice() == other
}
}
impl PartialEq<str> for OwnedBytes {
fn eq(&self, other: &str) -> bool {
self.as_slice() == other.as_bytes()
}
}
impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
where
OwnedBytes: PartialEq<T>,
{
fn eq(&self, other: &&'a T) -> bool {
*self == **other
}
}
impl Deref for OwnedBytes { impl Deref for OwnedBytes {
type Target = [u8]; type Target = [u8];
@@ -334,14 +287,4 @@ mod tests {
assert_eq!(right.as_slice(), b""); assert_eq!(right.as_slice(), b"");
} }
} }
#[test]
fn test_split_off() {
let mut data = OwnedBytes::new(b"abcdef".as_ref());
assert_eq!(data, "abcdef");
assert_eq!(data.split_off(2), "cdef");
assert_eq!(data, "ab");
assert_eq!(data.split_off(1), "b");
assert_eq!(data, "a");
}
} }

View File

@@ -5,9 +5,9 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
description = """Search engine library""" description = """Search engine library"""
documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html" documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/quickwit-inc/tantivy" homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/quickwit-inc/tantivy" repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md" readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
edition = "2018" edition = "2018"

View File

@@ -91,7 +91,6 @@ pub enum UserInputAst {
} }
impl UserInputAst { impl UserInputAst {
#[must_use]
pub fn unary(self, occur: Occur) -> UserInputAst { pub fn unary(self, occur: Occur) -> UserInputAst {
UserInputAst::Clause(vec![(Some(occur), self)]) UserInputAst::Clause(vec![(Some(occur), self)])
} }

View File

@@ -20,10 +20,10 @@ use crate::SegmentReader;
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer(3_000_000).unwrap(); /// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap(); /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap(); /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap(); /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap(); /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok()); /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader().unwrap(); /// let reader = index.reader().unwrap();

View File

@@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ```rust /// ```rust
/// use tantivy::collector::FacetCollector; /// use tantivy::collector::FacetCollector;
/// use tantivy::query::AllQuery; /// use tantivy::query::AllQuery;
/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT}; /// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
/// use tantivy::{doc, Index}; /// use tantivy::{doc, Index};
/// ///
/// fn example() -> tantivy::Result<()> { /// fn example() -> tantivy::Result<()> {
@@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// // Facet have their own specific type. /// // Facet have their own specific type.
/// // It is not a bad practise to put all of your /// // It is not a bad practise to put all of your
/// // facet information in the same field. /// // facet information in the same field.
/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default()); /// let facet = schema_builder.add_facet_field("facet", INDEXED);
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// title => "The Name of the Wind", /// title => "The Name of the Wind",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/fantasy") /// facet => Facet::from("/category/fiction/fantasy")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "Dune", /// title => "Dune",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/sci-fi") /// facet => Facet::from("/category/fiction/sci-fi")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "La Vénus d'Ille", /// title => "La Vénus d'Ille",
/// facet => Facet::from("/lang/fr"), /// facet => Facet::from("/lang/fr"),
/// facet => Facet::from("/category/fiction/fantasy"), /// facet => Facet::from("/category/fiction/fantasy"),
/// facet => Facet::from("/category/fiction/horror") /// facet => Facet::from("/category/fiction/horror")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl", /// title => "The Diary of a Young Girl",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography") /// facet => Facet::from("/category/biography")
/// ))?; /// ));
/// index_writer.commit()?; /// index_writer.commit()?;
/// } /// }
/// let reader = index.reader()?; /// let reader = index.reader()?;
@@ -400,7 +400,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {
impl FacetCounts { impl FacetCounts {
/// Returns an iterator over all of the facet count pairs inside this result. /// Returns an iterator over all of the facet count pairs inside this result.
/// See the documentation for [FacetCollector] for a usage example. /// See the documentation for `FacetCollector` for a usage example.
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_> pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
where where
Facet: From<T>, Facet: From<T>,
@@ -421,7 +421,7 @@ impl FacetCounts {
} }
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts. /// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
/// See the documentation for [FacetCollector] for a usage example. /// See the documentation for `FacetCollector` for a usage example.
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)> pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
where where
Facet: From<T>, Facet: From<T>,
@@ -462,7 +462,7 @@ mod tests {
use crate::collector::Count; use crate::collector::Count;
use crate::core::Index; use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery}; use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema}; use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
use crate::Term; use crate::Term;
use rand::distributions::Uniform; use rand::distributions::Uniform;
use rand::prelude::SliceRandom; use rand::prelude::SliceRandom;
@@ -470,13 +470,13 @@ mod tests {
use std::iter; use std::iter;
#[test] #[test]
fn test_facet_collector_drilldown() -> crate::Result<()> { fn test_facet_collector_drilldown() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5; let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets) let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| { .map(|mut n| {
@@ -491,14 +491,14 @@ mod tests {
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone()); doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1")); facet_collector.add_facet(Facet::from("/top1"));
let counts = searcher.search(&AllQuery, &facet_collector)?; let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
{ {
let facets: Vec<(String, u64)> = counts let facets: Vec<(String, u64)> = counts
@@ -518,7 +518,6 @@ mod tests {
.collect::<Vec<_>>() .collect::<Vec<_>>()
); );
} }
Ok(())
} }
#[test] #[test]
@@ -531,49 +530,48 @@ mod tests {
} }
#[test] #[test]
fn test_doc_unsorted_multifacet() -> crate::Result<()> { fn test_doc_unsorted_multifacet() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(), facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(), facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(), facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(), facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
))?; ));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1); assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects"); facet_collector.add_facet("/subjects");
let counts = searcher.search(&AllQuery, &facet_collector)?; let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect(); let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1); assert_eq!(facets[0].1, 1);
Ok(())
} }
#[test] #[test]
fn test_doc_search_by_facet() -> crate::Result<()> { fn test_doc_search_by_facet() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A").unwrap(), facet_field => Facet::from_text(&"/A/A").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B").unwrap(), facet_field => Facet::from_text(&"/A/B").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A").unwrap(), facet_field => Facet::from_text(&"/A/C/A").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A").unwrap(), facet_field => Facet::from_text(&"/D/C/A").unwrap(),
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -615,7 +613,7 @@ mod tests {
#[test] #[test]
fn test_facet_collector_topk() { fn test_facet_collector_topk() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -639,7 +637,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
@@ -664,7 +662,7 @@ mod tests {
#[test] #[test]
fn test_facet_collector_topk_tie_break() -> crate::Result<()> { fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -679,7 +677,7 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for doc in docs { for doc in docs {
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit()?;
@@ -727,7 +725,7 @@ mod bench {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();

View File

@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field; use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError}; use crate::{Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate. /// The `FilterCollector` collector filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next collector. /// Only the documents for which the predicate returned "true" will be passed on to the next collector.
/// ///
/// ```rust /// ```rust
@@ -25,37 +25,34 @@ use crate::{Score, SegmentReader, TantivyError};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST}; /// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST); /// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?; /// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?; /// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2)); /// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?; /// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// ///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2)); /// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?; /// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
/// ///
/// assert_eq!(filtered_top_docs.len(), 0); /// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ``` /// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue> pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where where

View File

@@ -226,10 +226,10 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?; writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>-30i64))?; writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-12i64))?; writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-10i64))?; writer.add_document(doc!(val_field=>-10i64));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?; writer.add_document(doc!(val_field=>12i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-30i64))?; writer.add_document(doc!(val_field=>-30i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-12i64))?; writer.add_document(doc!(val_field=>-12i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-10i64))?; writer.add_document(doc!(val_field=>-10i64));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();

View File

@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
# let mut index_writer = index.writer(3_000_000)?; # let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!( # index_writer.add_document(doc!(
# title => "The Name of the Wind", # title => "The Name of the Wind",
# ))?; # ));
# index_writer.add_document(doc!( # index_writer.add_document(doc!(
# title => "The Diary of Muadib", # title => "The Diary of Muadib",
# ))?; # ));
# index_writer.commit()?; # index_writer.commit()?;
# let reader = index.reader()?; # let reader = index.reader()?;
# let searcher = reader.searcher(); # let searcher = reader.searcher();
@@ -178,9 +178,9 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> { ) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?; let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
if let Some(alive_bitset) = reader.alive_bitset() { if let Some(delete_bitset) = reader.delete_bitset() {
weight.for_each(reader, &mut |doc, score| { weight.for_each(reader, &mut |doc, score| {
if alive_bitset.is_alive(doc) { if delete_bitset.is_alive(doc) {
segment_collector.collect(doc, score); segment_collector.collect(doc, score);
} }
})?; })?;

View File

@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index}; /// use tantivy::{doc, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
/// ///
/// let reader = index.reader()?; /// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let mut collectors = MultiCollector::new(); /// let mut collectors = MultiCollector::new();
@@ -139,8 +139,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// ///
/// assert_eq!(count, 2); /// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2); /// assert_eq!(top_docs.len(), 2);
/// # Ok(())
/// # }
/// ``` /// ```
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
#[derive(Default)] #[derive(Default)]
@@ -254,24 +252,24 @@ mod tests {
use crate::Term; use crate::Term;
#[test] #[test]
fn test_multi_collector() -> crate::Result<()> { fn test_multi_collector() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"))?; index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"))?; index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"))?; index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""))?; index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"))?; index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"))?; index_writer.add_document(doc!(text=>"abc"));
index_writer.commit()?; index_writer.commit().unwrap();
} }
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let term = Term::from_field_text(text, "abc"); let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
@@ -282,6 +280,5 @@ mod tests {
assert_eq!(count_handler.extract(&mut multifruits), 5); assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2); assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
Ok(())
} }
} }

View File

@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
}; };
#[test] #[test]
pub fn test_filter_collector() -> crate::Result<()> { pub fn test_filter_collector() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST); let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() -> crate::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]); let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary")?; let query = query_parser.parse_query("diary").unwrap();
let filter_some_collector = FilterCollector::new( let filter_some_collector = FilterCollector::new(
price, price,
&|value: u64| value > 20_120u64, &|value: u64| value > 20_120u64,
TopDocs::with_limit(2), TopDocs::with_limit(2),
); );
let top_docs = searcher.search(&query, &filter_some_collector)?; let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
assert_eq!(top_docs.len(), 1); assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,10 +67,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
} }
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5)); let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?; let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
assert_eq!(filtered_date_docs.len(), 2); assert_eq!(filtered_date_docs.len(), 2);
Ok(())
} }
/// Stores all of the doc ids. /// Stores all of the doc ids.
@@ -275,8 +274,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
Ok(index.reader()?.searcher()) Ok(index.reader()?.searcher())
} }

View File

@@ -70,7 +70,9 @@ where
/// # Panics /// # Panics
/// The method panics if limit is 0 /// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> { pub fn with_limit(limit: usize) -> TopCollector<T> {
assert!(limit >= 1, "Limit must be strictly greater than 0."); if limit < 1 {
panic!("Limit must be strictly greater than 0.");
}
Self { Self {
limit, limit,
offset: 0, offset: 0,

View File

@@ -94,30 +94,27 @@ where
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?; /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?; /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?; /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
/// ///
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3)); /// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ``` /// ```
pub struct TopDocs(TopCollector<Score>); pub struct TopDocs(TopCollector<Score>);
@@ -183,34 +180,30 @@ impl TopDocs {
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?; /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?; /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?; /// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?; /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
/// ///
/// assert_eq!(top_docs.len(), 2); /// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3)); /// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// Ok(())
/// # }
/// ``` /// ```
#[must_use]
pub fn and_offset(self, offset: usize) -> TopDocs { pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset)) TopDocs(self.0.and_offset(offset))
} }
@@ -241,11 +234,11 @@ impl TopDocs {
/// # /// #
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?; /// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?; /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?; /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?; /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # index_writer.commit()?; /// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?; /// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?; /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?; /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
@@ -323,9 +316,9 @@ impl TopDocs {
/// # /// #
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?; /// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?; /// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?; /// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
/// # assert!(index_writer.commit().is_ok()); /// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?; /// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?; /// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
@@ -424,9 +417,9 @@ impl TopDocs {
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap(); /// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?; /// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?; /// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?; /// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
/// index_writer.commit()?; /// index_writer.commit()?;
/// Ok(index) /// Ok(index)
/// } /// }
@@ -534,9 +527,9 @@ impl TopDocs {
/// # /// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap(); /// let boosted: Field = index.schema().get_field("boosted").unwrap();
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?; /// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?; /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?; /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
/// # index_writer.commit()?; /// # index_writer.commit()?;
/// // ... /// // ...
/// # let user_query = "diary"; /// # let user_query = "diary";
@@ -636,10 +629,10 @@ impl Collector for TopDocs {
let heap_len = self.0.limit + self.0.offset; let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len); let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(alive_bitset) = reader.alive_bitset() { if let Some(delete_bitset) = reader.delete_bitset() {
let mut threshold = Score::MIN; let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| { weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if alive_bitset.is_deleted(doc) { if delete_bitset.is_deleted(doc) {
return threshold; return threshold;
} }
let heap_item = ComparableDoc { let heap_item = ComparableDoc {
@@ -720,18 +713,20 @@ mod tests {
use crate::Score; use crate::Score;
use crate::{DocAddress, DocId, SegmentReader}; use crate::{DocAddress, DocId, SegmentReader};
fn make_index() -> crate::Result<Index> { fn make_index() -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; // writing the segment
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?; let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?; index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"I like Droopy"))?; index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"I like Droopy"));
Ok(index) assert!(index_writer.commit().is_ok());
}
index
} }
fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) { fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -742,15 +737,17 @@ mod tests {
} }
#[test] #[test]
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> { fn test_top_collector_not_at_capacity_without_offset() {
let index = make_index()?; let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax")?; let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index let score_docs: Vec<(Score, DocAddress)> = index
.reader()? .reader()
.unwrap()
.searcher() .searcher()
.search(&text_query, &TopDocs::with_limit(4))?; .search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_results_equals( assert_results_equals(
&score_docs, &score_docs,
&[ &[
@@ -759,12 +756,11 @@ mod tests {
(0.48527452, DocAddress::new(0, 0)), (0.48527452, DocAddress::new(0, 0)),
], ],
); );
Ok(())
} }
#[test] #[test]
fn test_top_collector_not_at_capacity_with_offset() { fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -779,7 +775,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_at_capacity() { fn test_top_collector_at_capacity() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -800,7 +796,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_at_capacity_with_offset() { fn test_top_collector_at_capacity_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -821,7 +817,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_stable_sorting() { fn test_top_collector_stable_sorting() {
let index = make_index().unwrap(); let index = make_index();
// using AllQuery to get a constant score // using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
@@ -852,35 +848,29 @@ mod tests {
const SIZE: &str = "size"; const SIZE: &str = "size";
#[test] #[test]
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> { fn test_top_field_collector_not_at_capacity() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT); let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| { let (index, query) = index("beer", title, schema, |index_writer| {
index_writer index_writer.add_document(doc!(
.add_document(doc!( title => "bottle of beer",
title => "bottle of beer", size => 12u64,
size => 12u64, ));
)) index_writer.add_document(doc!(
.unwrap(); title => "growler of beer",
index_writer size => 64u64,
.add_document(doc!( ));
title => "growler of beer", index_writer.add_document(doc!(
size => 64u64, title => "pint of beer",
)) size => 16u64,
.unwrap(); ));
index_writer
.add_document(doc!(
title => "pint of beer",
size => 16u64,
))
.unwrap();
}); });
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?; let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
assert_eq!( assert_eq!(
&top_docs[..], &top_docs[..],
&[ &[
@@ -889,7 +879,6 @@ mod tests {
(12, DocAddress::new(0, 0)) (12, DocAddress::new(0, 0))
] ]
); );
Ok(())
} }
#[test] #[test]
@@ -905,12 +894,12 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Paul Robeson", name => "Paul Robeson",
birthday => pr_birthday birthday => pr_birthday
))?; ));
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?; let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Minnie Riperton", name => "Minnie Riperton",
birthday => mr_birthday birthday => mr_birthday
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday); let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
@@ -937,11 +926,11 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "georgetown", city => "georgetown",
altitude => -1i64, altitude => -1i64,
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "tokyo", city => "tokyo",
altitude => 40i64, altitude => 40i64,
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -967,11 +956,11 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "georgetown", city => "georgetown",
altitude => -1.0f64, altitude => -1.0f64,
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "tokyo", city => "tokyo",
altitude => 40f64, altitude => 40f64,
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -994,12 +983,10 @@ mod tests {
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let (index, _) = index("beer", title, schema, |index_writer| { let (index, _) = index("beer", title, schema, |index_writer| {
index_writer index_writer.add_document(doc!(
.add_document(doc!( title => "bottle of beer",
title => "bottle of beer", size => 12u64,
size => 12u64, ));
))
.unwrap();
}); });
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2)); let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
@@ -1016,7 +1003,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64))?; index_writer.add_document(doc!(size=>1u64));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0); let segment = searcher.segment_reader(0);
@@ -1033,7 +1020,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64))?; index_writer.add_document(doc!(size=>1u64));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0); let segment = searcher.segment_reader(0);
@@ -1046,26 +1033,30 @@ mod tests {
} }
#[test] #[test]
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> { fn test_tweak_score_top_collector_with_offset() {
let index = make_index()?; let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax")?; let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score( let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc, move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
); );
let score_docs: Vec<(u32, DocAddress)> = let score_docs: Vec<(u32, DocAddress)> = index
index.reader()?.searcher().search(&text_query, &collector)?; .reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!( assert_eq!(
score_docs, score_docs,
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),] vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
); );
Ok(())
} }
#[test] #[test]
fn test_custom_score_top_collector_with_offset() { fn test_custom_score_top_collector_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();

View File

@@ -123,8 +123,8 @@ impl IndexBuilder {
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error. /// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> { pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?); let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&*mmap_directory)? { if Index::exists(&mmap_directory)? {
return Err(TantivyError::IndexAlreadyExists); return Err(TantivyError::IndexAlreadyExists);
} }
self.create(mmap_directory) self.create(mmap_directory)
@@ -139,7 +139,7 @@ impl IndexBuilder {
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`. /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn create_from_tempdir(self) -> crate::Result<Index> { pub fn create_from_tempdir(self) -> crate::Result<Index> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?); let mmap_directory = MmapDirectory::create_from_tempdir()?;
self.create(mmap_directory) self.create(mmap_directory)
} }
fn get_expect_schema(&self) -> crate::Result<Schema> { fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,9 +149,8 @@ impl IndexBuilder {
.ok_or(TantivyError::IndexBuilderMissingArgument("schema")) .ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
} }
/// Opens or creates a new index in the provided directory /// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> { pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
let dir = dir.into(); if !Index::exists(&dir)? {
if !Index::exists(&*dir)? {
return self.create(dir); return self.create(dir);
} }
let index = Index::open(dir)?; let index = Index::open(dir)?;
@@ -166,8 +165,7 @@ impl IndexBuilder {
/// Creates a new index given an implementation of the trait `Directory`. /// Creates a new index given an implementation of the trait `Directory`.
/// ///
/// If a directory previously existed, it will be erased. /// If a directory previously existed, it will be erased.
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> { fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
let dir = dir.into();
let directory = ManagedDirectory::wrap(dir)?; let directory = ManagedDirectory::wrap(dir)?;
save_new_metas( save_new_metas(
self.get_expect_schema()?, self.get_expect_schema()?,
@@ -200,7 +198,7 @@ impl Index {
/// Examines the directory to see if it contains an index. /// Examines the directory to see if it contains an index.
/// ///
/// Effectively, it only checks for the presence of the `meta.json` file. /// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> { pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
dir.exists(&META_FILEPATH) dir.exists(&META_FILEPATH)
} }
@@ -217,7 +215,7 @@ impl Index {
/// Replace the default single thread search executor pool /// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads. /// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> { pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?); self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
Ok(()) Ok(())
} }
@@ -252,11 +250,7 @@ impl Index {
} }
/// Opens or creates a new index in the provided directory /// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>( pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
dir: T,
schema: Schema,
) -> crate::Result<Index> {
let dir = dir.into();
IndexBuilder::new().schema(schema).open_or_create(dir) IndexBuilder::new().schema(schema).open_or_create(dir)
} }
@@ -276,12 +270,11 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`. /// Creates a new index given an implementation of the trait `Directory`.
/// ///
/// If a directory previously existed, it will be erased. /// If a directory previously existed, it will be erased.
pub fn create<T: Into<Box<dyn Directory>>>( pub fn create<Dir: Directory>(
dir: T, dir: Dir,
schema: Schema, schema: Schema,
settings: IndexSettings, settings: IndexSettings,
) -> crate::Result<Index> { ) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
let mut builder = IndexBuilder::new().schema(schema); let mut builder = IndexBuilder::new().schema(schema);
builder = builder.settings(settings); builder = builder.settings(settings);
builder.create(dir) builder.create(dir)
@@ -372,8 +365,7 @@ impl Index {
} }
/// Open the index using the provided directory /// Open the index using the provided directory
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> { pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
let directory = directory.into();
let directory = ManagedDirectory::wrap(directory)?; let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default(); let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?; let metas = load_metas(&directory, &inventory)?;
@@ -403,7 +395,9 @@ impl Index {
/// ///
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`. /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` ///
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads( pub fn writer_with_num_threads(
&self, &self,
num_threads: usize, num_threads: usize,
@@ -445,13 +439,14 @@ impl Index {
/// Creates a multithreaded writer /// Creates a multithreaded writer
/// ///
/// Tantivy will automatically define the number of threads to use, but /// Tantivy will automatically define the number of threads to use, but
/// no more than 8 threads. /// no more than [`MAX_NUM_THREAD`] threads.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split /// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads. /// between a given number of threads.
/// ///
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` /// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> { pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD); let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
@@ -582,15 +577,15 @@ mod tests {
#[test] #[test]
fn test_index_exists() { fn test_index_exists() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(!Index::exists(directory.as_ref()).unwrap()); assert!(!Index::exists(&directory).unwrap());
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
} }
#[test] #[test]
@@ -603,27 +598,27 @@ mod tests {
#[test] #[test]
fn open_or_create_should_open() { fn open_or_create_should_open() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok()); assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
} }
#[test] #[test]
fn create_should_wipeoff_existing() { fn create_should_wipeoff_existing() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
assert!(Index::create( assert!(Index::create(
directory, directory,
Schema::builder().build(), Schema::builder().build(),
@@ -657,7 +652,7 @@ mod tests {
} }
#[test] #[test]
fn test_index_on_commit_reload_policy() -> crate::Result<()> { fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -667,7 +662,7 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader) test_index_on_commit_reload_policy_aux(field, &index, &reader);
} }
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
@@ -679,7 +674,7 @@ mod tests {
use tempfile::TempDir; use tempfile::TempDir;
#[test] #[test]
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> { fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap(); let tempdir = TempDir::new().unwrap();
@@ -691,7 +686,7 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader) test_index_on_commit_reload_policy_aux(field, &index, &reader);
} }
#[test] #[test]
@@ -706,7 +701,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()?;
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?; writer.add_document(doc!(field=>1u64));
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || { let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
@@ -720,7 +715,7 @@ mod tests {
} }
#[test] #[test]
fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> { fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap(); let tempdir = TempDir::new().unwrap();
@@ -733,14 +728,10 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &write_index, &reader) test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
} }
} }
fn test_index_on_commit_reload_policy_aux( fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
field: Field,
index: &Index,
reader: &IndexReader,
) -> crate::Result<()> {
let mut reader_index = reader.index(); let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index let _watch_handle = reader_index
@@ -748,9 +739,9 @@ mod tests {
.watch(WatchCallback::new(move || { .watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
let mut writer = index.writer_for_tests()?; let mut writer = index.writer_for_tests().unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?; writer.add_document(doc!(field=>1u64));
writer.commit().unwrap(); writer.commit().unwrap();
// We need a loop here because it is possible for notify to send more than // We need a loop here because it is possible for notify to send more than
// one modify event. It was observed on CI on MacOS. // one modify event. It was observed on CI on MacOS.
@@ -760,7 +751,7 @@ mod tests {
break; break;
} }
} }
writer.add_document(doc!(field=>2u64))?; writer.add_document(doc!(field=>2u64));
writer.commit().unwrap(); writer.commit().unwrap();
// ... Same as above // ... Same as above
loop { loop {
@@ -769,37 +760,37 @@ mod tests {
break; break;
} }
} }
Ok(())
} }
// This test will not pass on windows, because windows // This test will not pass on windows, because windows
// prevent deleting files that are MMapped. // prevent deleting files that are MMapped.
#[cfg(not(target_os = "windows"))] #[cfg(not(target_os = "windows"))]
#[test] #[test]
fn garbage_collect_works_as_intended() -> crate::Result<()> { fn garbage_collect_works_as_intended() {
let directory = RamDirectory::create(); let directory = RamDirectory::create();
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?; let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap(); let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 { for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i))?; writer.add_document(doc!(field => i));
} }
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = directory.watch(WatchCallback::new(move || { let _handle = directory.watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
writer.commit()?; writer.commit().unwrap();
let mem_right_after_commit = directory.total_mem_usage(); let mem_right_after_commit = directory.total_mem_usage();
assert!(receiver.recv().is_ok()); assert!(receiver.recv().is_ok());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 8_000); assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads()?; writer.wait_merging_threads().unwrap();
let mem_right_after_merge_finished = directory.total_mem_usage(); let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap(); reader.reload().unwrap();
@@ -811,6 +802,5 @@ mod tests {
mem_right_after_merge_finished, mem_right_after_merge_finished,
mem_right_after_commit mem_right_after_commit
); );
Ok(())
} }
} }

View File

@@ -2,7 +2,7 @@ use super::SegmentComponent;
use crate::schema::Schema; use crate::schema::Schema;
use crate::Opstamp; use crate::Opstamp;
use crate::{core::SegmentId, store::Compressor}; use crate::{core::SegmentId, store::Compressor};
use crate::{Inventory, TrackedObject}; use census::{Inventory, TrackedObject};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::PathBuf; use std::path::PathBuf;
use std::{collections::HashSet, sync::atomic::AtomicBool}; use std::{collections::HashSet, sync::atomic::AtomicBool};
@@ -189,10 +189,6 @@ impl SegmentMeta {
#[doc(hidden)] #[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta { pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
assert!(
num_deleted_docs <= self.max_doc(),
"There cannot be more deleted docs than there are docs."
);
let delete_meta = DeleteMeta { let delete_meta = DeleteMeta {
num_deleted_docs, num_deleted_docs,
opstamp, opstamp,
@@ -398,7 +394,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!( assert_eq!(
json, json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"# r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
); );
} }
} }

View File

@@ -14,7 +14,7 @@ pub use self::index_meta::{
IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory, IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
}; };
pub use self::inverted_index_reader::InvertedIndexReader; pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::{Searcher, SearcherGeneration}; pub use self::searcher::Searcher;
pub use self::segment::Segment; pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent; pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId; pub use self::segment_id::SegmentId;

View File

@@ -1,5 +1,6 @@
use crate::collector::Collector; use crate::collector::Collector;
use crate::core::Executor; use crate::core::Executor;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::query::Query; use crate::query::Query;
use crate::schema::Document; use crate::schema::Document;
@@ -9,62 +10,9 @@ use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
use crate::DocAddress; use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::Opstamp;
use crate::SegmentId;
use crate::TrackedObject;
use std::collections::BTreeMap;
use std::{fmt, io}; use std::{fmt, io};
/// Identifies the searcher generation accessed by a [Searcher].
///
/// While this might seem redundant, a [SearcherGeneration] contains
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
///
/// This is on purpose. This object is used by the `Warmer` API.
/// Having both information makes it possible to identify which
/// artifact should be refreshed or garbage collected.
///
/// Depending on the use case, `Warmer`'s implementers can decide to
/// produce artifacts per:
/// - `generation_id` (e.g. some searcher level aggregates)
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
/// - `segment_id` (e.g. for immutable document level information)
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
/// - ...
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SearcherGeneration {
segments: BTreeMap<SegmentId, Option<Opstamp>>,
generation_id: u64,
}
impl SearcherGeneration {
pub(crate) fn from_segment_readers(
segment_readers: &[SegmentReader],
generation_id: u64,
) -> Self {
let mut segment_id_to_del_opstamp = BTreeMap::new();
for segment_reader in segment_readers {
segment_id_to_del_opstamp
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
}
Self {
segments: segment_id_to_del_opstamp,
generation_id,
}
}
/// Returns the searcher generation id.
pub fn generation_id(&self) -> u64 {
self.generation_id
}
/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
&self.segments
}
}
/// Holds a list of `SegmentReader`s ready for search. /// Holds a list of `SegmentReader`s ready for search.
/// ///
/// It guarantees that the `Segment` will not be removed before /// It guarantees that the `Segment` will not be removed before
@@ -75,7 +23,6 @@ pub struct Searcher {
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>, store_readers: Vec<StoreReader>,
generation: TrackedObject<SearcherGeneration>,
} }
impl Searcher { impl Searcher {
@@ -84,7 +31,6 @@ impl Searcher {
schema: Schema, schema: Schema,
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
) -> io::Result<Searcher> { ) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers let store_readers: Vec<StoreReader> = segment_readers
.iter() .iter()
@@ -95,7 +41,6 @@ impl Searcher {
index, index,
segment_readers, segment_readers,
store_readers, store_readers,
generation,
}) })
} }
@@ -104,11 +49,6 @@ impl Searcher {
&self.index &self.index
} }
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
pub fn generation(&self) -> &SearcherGeneration {
self.generation.as_ref()
}
/// Fetches a document from tantivy's store given a `DocAddress`. /// Fetches a document from tantivy's store given a `DocAddress`.
/// ///
/// The searcher uses the segment ordinal to route the /// The searcher uses the segment ordinal to route the
@@ -148,7 +88,7 @@ impl Searcher {
&self.segment_readers &self.segment_readers
} }
/// Returns the segment_reader associated with the given segment_ord /// Returns the segment_reader associated with the given segment_ordinal
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader { pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize] &self.segment_readers[segment_ord as usize]
} }

View File

@@ -5,8 +5,7 @@ use crate::core::SegmentId;
use crate::directory::CompositeFile; use crate::directory::CompositeFile;
use crate::directory::FileSlice; use crate::directory::FileSlice;
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::intersect_alive_bitsets; use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader; use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders; use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -17,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
use crate::termdict::TermDictionary; use crate::termdict::TermDictionary;
use crate::DocId; use crate::DocId;
use crate::Opstamp;
use fail::fail_point; use fail::fail_point;
use std::fmt; use std::fmt;
use std::sync::Arc; use std::sync::Arc;
@@ -39,8 +37,6 @@ pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>, inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId, segment_id: SegmentId,
delete_opstamp: Option<Opstamp>,
max_doc: DocId, max_doc: DocId,
num_docs: DocId, num_docs: DocId,
@@ -51,7 +47,7 @@ pub struct SegmentReader {
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
store_file: FileSlice, store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>, delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema, schema: Schema,
} }
@@ -76,12 +72,14 @@ impl SegmentReader {
/// Return the number of documents that have been /// Return the number of documents that have been
/// deleted in the segment. /// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId { pub fn num_deleted_docs(&self) -> DocId {
self.max_doc - self.num_docs self.delete_bitset()
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
} }
/// Returns true iff some of the documents of the segment have been deleted. /// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool { pub fn has_deletes(&self) -> bool {
self.num_deleted_docs() > 0 self.delete_bitset().is_some()
} }
/// Accessor to a segment's fast field reader given a field. /// Accessor to a segment's fast field reader given a field.
@@ -103,7 +101,7 @@ impl SegmentReader {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() { match field_entry.field_type() {
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
let term_ords_reader = self.fast_fields().u64s(field)?; let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict = self let termdict = self
.termdict_composite .termdict_composite
@@ -130,17 +128,13 @@ impl SegmentReader {
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
let field_name = self.schema.get_field_name(field); let field_name = self.schema.get_field_name(field);
let err_msg = format!( let err_msg = format!(
"Field norm not found for field {:?}. Was the field set to record norm during indexing?", "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
field_name field_name
); );
crate::TantivyError::SchemaError(err_msg) crate::TantivyError::SchemaError(err_msg)
}) })
} }
pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
&self.fieldnorm_readers
}
/// Accessor to the segment's `StoreReader`. /// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> io::Result<StoreReader> { pub fn get_store_reader(&self) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone()) StoreReader::open(self.store_file.clone())
@@ -148,14 +142,6 @@ impl SegmentReader {
/// Open a new segment for reading. /// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> { pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}
/// Open a new segment for reading.
pub fn open_with_custom_alive_set(
segment: &Segment,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::Terms)?; let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?; let termdict_composite = CompositeFile::open(&termdict_file)?;
@@ -180,37 +166,29 @@ impl SegmentReader {
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?; let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers = let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite)); Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let original_bitset = if segment.meta().has_deletes() { let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_file_slice = segment.open_read(SegmentComponent::Delete)?; let delete_data = segment.open_read(SegmentComponent::Delete)?;
let delete_data = delete_file_slice.read_bytes()?; let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(AliveBitSet::open(delete_data)) Some(delete_bitset)
} else { } else {
None None
}; };
let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
let max_doc = segment.meta().max_doc();
let num_docs = alive_bitset_opt
.as_ref()
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
.unwrap_or(max_doc);
Ok(SegmentReader { Ok(SegmentReader {
inv_idx_reader_cache: Default::default(), inv_idx_reader_cache: Default::default(),
num_docs, max_doc: segment.meta().max_doc(),
max_doc, num_docs: segment.meta().num_docs(),
termdict_composite, termdict_composite,
postings_composite, postings_composite,
fast_fields_readers: fast_field_readers, fast_fields_readers: fast_field_readers,
fieldnorm_readers, fieldnorm_readers,
segment_id: segment.id(), segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file, store_file,
alive_bitset_opt, delete_bitset_opt,
positions_composite, positions_composite,
schema, schema,
}) })
@@ -294,32 +272,23 @@ impl SegmentReader {
self.segment_id self.segment_id
} }
/// Returns the delete opstamp
pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.delete_opstamp
}
/// Returns the bitset representing /// Returns the bitset representing
/// the documents that have been deleted. /// the documents that have been deleted.
pub fn alive_bitset(&self) -> Option<&AliveBitSet> { pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
self.alive_bitset_opt.as_ref() self.delete_bitset_opt.as_ref()
} }
/// Returns true iff the `doc` is marked /// Returns true iff the `doc` is marked
/// as deleted. /// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool { pub fn is_deleted(&self, doc: DocId) -> bool {
self.alive_bitset() self.delete_bitset()
.map(|delete_set| delete_set.is_deleted(doc)) .map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false) .unwrap_or(false)
} }
/// Returns an iterator that will iterate over the alive document ids /// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> { pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
if let Some(alive_bitset) = &self.alive_bitset_opt { (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc)
}
} }
/// Summarize total space usage of this segment. /// Summarize total space usage of this segment.
@@ -332,29 +301,14 @@ impl SegmentReader {
self.fast_fields_readers.space_usage(), self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(), self.fieldnorm_readers.space_usage(),
self.get_store_reader()?.space_usage(), self.get_store_reader()?.space_usage(),
self.alive_bitset_opt self.delete_bitset_opt
.as_ref() .as_ref()
.map(AliveBitSet::space_usage) .map(DeleteBitSet::space_usage)
.unwrap_or(0), .unwrap_or(0),
)) ))
} }
} }
fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>,
) -> Option<AliveBitSet> {
match (left_opt, right_opt) {
(Some(left), Some(right)) => {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
Some(intersect_alive_bitsets(left, right))
}
(Some(left), None) => Some(left),
(None, Some(right)) => Some(right),
(None, None) => None,
}
}
impl fmt::Debug for SegmentReader { impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id) write!(f, "SegmentReader({:?})", self.segment_id)
@@ -377,10 +331,10 @@ mod test {
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"))?; index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs // we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse")); index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap")); index_writer.delete_term(Term::from_field_text(name, "cap"));
@@ -403,10 +357,10 @@ mod test {
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"))?; index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs // we should now have one segment with two docs
index_writer.commit()?; index_writer.commit()?;
} }

View File

@@ -43,8 +43,10 @@ impl RetryPolicy {
} }
/// The `DirectoryLock` is an object that represents a file lock. /// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
/// ///
/// It is associated to a lock file, that gets deleted on `Drop.` /// It is transparently associated to a lock file, that gets deleted
/// on `Drop.` The lock is released automatically on `Drop`.
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>); pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
struct DirectoryLockGuard { struct DirectoryLockGuard {
@@ -140,16 +142,10 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Opens a writer for the *virtual file* associated with /// Opens a writer for the *virtual file* associated with
/// a Path. /// a Path.
/// ///
/// Right after this call, for the span of the execution of the program /// Right after this call, the file should be created
/// the file should be created and any subsequent call to `open_read` for the /// and any subsequent call to `open_read` for the
/// same path should return a `FileSlice`. /// same path should return a `FileSlice`.
/// ///
/// However, depending on the directory implementation,
/// it might be required to call `sync_directory` to ensure
/// that the file is durably created.
/// (The semantics here are the same when dealing with
/// a posix filesystem.)
///
/// Write operations may be aggressively buffered. /// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush /// The client of this trait is responsible for calling flush
/// to ensure that subsequent `read` operations /// to ensure that subsequent `read` operations
@@ -180,12 +176,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// The file may or may not previously exist. /// The file may or may not previously exist.
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>; fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Sync the directory.
///
/// This call is required to ensure that newly created files are
/// effectively stored durably.
fn sync_directory(&self) -> io::Result<()>;
/// Acquire a lock in the given directory. /// Acquire a lock in the given directory.
/// ///
/// The method is blocking or not depending on the `Lock` object. /// The method is blocking or not depending on the `Lock` object.
@@ -240,15 +230,3 @@ where
Box::new(self.clone()) Box::new(self.clone())
} }
} }
impl Clone for Box<dyn Directory> {
fn clone(&self) -> Self {
self.box_clone()
}
}
impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
fn from(t: T) -> Self {
Box::new(t)
}
}

View File

@@ -7,8 +7,8 @@ use std::path::PathBuf;
/// [`LockParams`](./enum.LockParams.html). /// [`LockParams`](./enum.LockParams.html).
/// Tantivy itself uses only two locks but client application /// Tantivy itself uses only two locks but client application
/// can use the directory facility to define their own locks. /// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK] /// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK] /// - [META_LOCK](./struct.META_LOCK.html)
/// ///
/// Check out these locks documentation for more information. /// Check out these locks documentation for more information.
/// ///

View File

@@ -39,16 +39,6 @@ pub enum OpenDirectoryError {
}, },
} }
impl OpenDirectoryError {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
Self::IoError {
io_error,
directory_path,
}
}
}
/// Error that may occur when starting to write in a file /// Error that may occur when starting to write in a file
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum OpenWriteError { pub enum OpenWriteError {

View File

@@ -66,7 +66,6 @@ impl FileSlice {
/// Wraps a FileHandle. /// Wraps a FileHandle.
#[doc(hidden)] #[doc(hidden)]
#[must_use]
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self { pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
FileSlice { FileSlice {
data: Arc::from(file_handle), data: Arc::from(file_handle),

View File

@@ -43,16 +43,14 @@ impl FileWatcher {
thread::Builder::new() thread::Builder::new()
.name("thread-tantivy-meta-file-watcher".to_string()) .name("thread-tantivy-meta-file-watcher".to_string())
.spawn(move || { .spawn(move || {
let mut current_checksum_opt = None; let mut current_checksum = None;
while state.load(Ordering::SeqCst) == 1 { while state.load(Ordering::SeqCst) == 1 {
if let Ok(checksum) = FileWatcher::compute_checksum(&path) { if let Ok(checksum) = FileWatcher::compute_checksum(&path) {
let metafile_has_changed = current_checksum_opt // `None.unwrap_or_else(|| !checksum) != checksum` evaluates to `true`
.map(|current_checksum| current_checksum != checksum) if current_checksum.unwrap_or_else(|| !checksum) != checksum {
.unwrap_or(true);
if metafile_has_changed {
info!("Meta file {:?} was modified", path); info!("Meta file {:?} was modified", path);
current_checksum_opt = Some(checksum); current_checksum = Some(checksum);
futures::executor::block_on(callbacks.broadcast()); futures::executor::block_on(callbacks.broadcast());
} }
} }

View File

@@ -64,7 +64,7 @@ fn save_managed_paths(
impl ManagedDirectory { impl ManagedDirectory {
/// Wraps a directory as managed directory. /// Wraps a directory as managed directory.
pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> { pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) { match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => { Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data); let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
) )
})?; })?;
Ok(ManagedDirectory { Ok(ManagedDirectory {
directory, directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation { meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files, managed_paths: managed_files,
})), })),
}) })
} }
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory { Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory, directory: Box::new(directory),
meta_informations: Arc::default(), meta_informations: Arc::default(),
}), }),
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()), io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -192,7 +192,6 @@ impl ManagedDirectory {
for delete_file in &deleted_files { for delete_file in &deleted_files {
managed_paths_write.remove(delete_file); managed_paths_write.remove(delete_file);
} }
self.directory.sync_directory()?;
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?; save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
} }
@@ -223,22 +222,9 @@ impl ManagedDirectory {
.write() .write()
.expect("Managed file lock poisoned"); .expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if !has_changed { if has_changed {
return Ok(()); save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
} }
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
// This is not the first file we add.
// Therefore, we are sure that `.managed.json` has been already
// properly created and we do not need to sync its parent directory.
//
// (It might seem like a nicer solution to create the managed_json on the
// creation of the ManagedDirectory instance but it would actually
// prevent the use of read-only directories..)
let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1;
if managed_file_definitely_already_exists {
return Ok(());
}
self.directory.sync_directory()?;
Ok(()) Ok(())
} }
@@ -324,11 +310,6 @@ impl Directory for ManagedDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.directory.watch(watch_callback) self.directory.watch(watch_callback)
} }
fn sync_directory(&self) -> io::Result<()> {
self.directory.sync_directory()?;
Ok(())
}
} }
impl Clone for ManagedDirectory { impl Clone for ManagedDirectory {
@@ -359,7 +340,7 @@ mod tests_mmap_specific {
let test_path2: &'static Path = Path::new("some_path_for_test_2"); let test_path2: &'static Path = Path::new("some_path_for_test_2");
{ {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let write_file = managed_directory.open_write(test_path1).unwrap(); let write_file = managed_directory.open_write(test_path1).unwrap();
write_file.terminate().unwrap(); write_file.terminate().unwrap();
managed_directory managed_directory
@@ -374,7 +355,7 @@ mod tests_mmap_specific {
} }
{ {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
assert!(managed_directory.exists(test_path1).unwrap()); assert!(managed_directory.exists(test_path1).unwrap());
assert!(!managed_directory.exists(test_path2).unwrap()); assert!(!managed_directory.exists(test_path2).unwrap());
let living_files: HashSet<PathBuf> = HashSet::new(); let living_files: HashSet<PathBuf> = HashSet::new();
@@ -393,7 +374,7 @@ mod tests_mmap_specific {
let living_files = HashSet::new(); let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap(); let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap(); write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap(); write.terminate().unwrap();

View File

@@ -74,12 +74,20 @@ pub struct CacheInfo {
pub mmapped: Vec<PathBuf>, pub mmapped: Vec<PathBuf>,
} }
#[derive(Default)]
struct MmapCache { struct MmapCache {
counters: CacheCounters, counters: CacheCounters,
cache: HashMap<PathBuf, WeakArcBytes>, cache: HashMap<PathBuf, WeakArcBytes>,
} }
impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
}
}
}
impl MmapCache { impl MmapCache {
fn get_info(&self) -> CacheInfo { fn get_info(&self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect(); let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -193,19 +201,16 @@ impl MmapDirectory {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> { pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref(); let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() { if !directory_path.exists() {
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from( Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path, directory_path,
))); )))
} } else if !directory_path.is_dir() {
let canonical_path: PathBuf = directory_path.canonicalize().map_err(|io_err| { Err(OpenDirectoryError::NotADirectory(PathBuf::from(
OpenDirectoryError::wrap_io_error(io_err, PathBuf::from(directory_path))
})?;
if !canonical_path.is_dir() {
return Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path, directory_path,
))); )))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
} }
Ok(MmapDirectory::new(canonical_path, None))
} }
/// Joins a relative_path to the directory `root_path` /// Joins a relative_path to the directory `root_path`
@@ -214,6 +219,33 @@ impl MmapDirectory {
self.inner.root_path.join(relative_path) self.inner.root_path.join(relative_path)
} }
/// Sync the root directory.
/// In certain FS, this is required to persistently create
/// a file.
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_all()?;
Ok(())
}
/// Returns some statistical information /// Returns some statistical information
/// about the Mmap cache. /// about the Mmap cache.
/// ///
@@ -264,7 +296,8 @@ impl Write for SafeFileWriter {
} }
fn flush(&mut self) -> io::Result<()> { fn flush(&mut self) -> io::Result<()> {
Ok(()) self.0.flush()?;
self.0.sync_all()
} }
} }
@@ -276,9 +309,7 @@ impl Seek for SafeFileWriter {
impl TerminatingWrite for SafeFileWriter { impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.0.flush()?; self.flush()
self.0.sync_data()?;
Ok(())
} }
} }
@@ -308,7 +339,6 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?; let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
tempfile.write_all(content)?; tempfile.write_all(content)?;
tempfile.flush()?; tempfile.flush()?;
tempfile.as_file_mut().sync_data()?;
tempfile.into_temp_path().persist(path)?; tempfile.into_temp_path().persist(path)?;
Ok(()) Ok(())
} }
@@ -343,17 +373,22 @@ impl Directory for MmapDirectory {
/// removed before the file is deleted. /// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
fs::remove_file(&full_path).map_err(|e| { match fs::remove_file(&full_path) {
if e.kind() == io::ErrorKind::NotFound { Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
DeleteError::FileDoesNotExist(path.to_owned()) io_error: e,
} else { filepath: path.to_path_buf(),
DeleteError::IoError { }),
io_error: e, Err(e) => {
filepath: path.to_path_buf(), if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
})
} }
} }
})?; }
Ok(())
} }
fn exists(&self, path: &Path) -> Result<bool, OpenReadError> { fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
@@ -382,13 +417,10 @@ impl Directory for MmapDirectory {
file.flush() file.flush()
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?; .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;
// Note we actually do not sync the parent directory here. // Apparetntly, on some filesystem syncing the parent
// // directory is required.
// A newly created file, may, in some case, be created and even flushed to disk. self.sync_directory()
// and then lost... .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;
//
// The file will only be durably written after we terminate AND
// sync_directory() is called.
let writer = SafeFileWriter::new(file); let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer))) Ok(BufWriter::new(Box::new(writer)))
@@ -418,7 +450,7 @@ impl Directory for MmapDirectory {
debug!("Atomic Write {:?}", path); debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
atomic_write(&full_path, content)?; atomic_write(&full_path, content)?;
Ok(()) self.sync_directory()
} }
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> { fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
@@ -444,30 +476,6 @@ impl Directory for MmapDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.inner.watch(watch_callback)) Ok(self.inner.watch(watch_callback))
} }
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_data()?;
Ok(())
}
} }
#[cfg(test)] #[cfg(test)]
@@ -574,8 +582,8 @@ mod tests {
} }
#[test] #[test]
fn test_mmap_released() -> crate::Result<()> { fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir()?; let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let mut schema_builder: SchemaBuilder = Schema::builder(); let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -584,30 +592,31 @@ mod tests {
let index = let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap(); Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default(); let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3); log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));
for _num_commits in 0..10 { for _num_commits in 0..10 {
for _ in 0..10 { for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"))?; index_writer.add_document(doc!(text_field=>"abc"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
} }
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
for _ in 0..4 { for _ in 0..4 {
index_writer.add_document(doc!(text_field=>"abc"))?; index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit()?; index_writer.commit().unwrap();
reader.reload()?; reader.reload().unwrap();
} }
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads().unwrap();
reader.reload()?; reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len(); let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4); assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore = let num_components_except_deletes_and_tempstore =
@@ -618,6 +627,5 @@ mod tests {
); );
} }
assert!(mmap_directory.get_cache_info().mmapped.is_empty()); assert!(mmap_directory.get_cache_info().mmapped.is_empty());
Ok(())
} }
} }

View File

@@ -1,6 +1,6 @@
/*! /*!
WORM (Write Once Read Many) directory abstraction. WORM directory abstraction.
*/ */

View File

@@ -18,6 +18,13 @@ use super::FileHandle;
/// Writer associated with the `RamDirectory` /// Writer associated with the `RamDirectory`
/// ///
/// The Writer just writes a buffer. /// The Writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter { struct VecWriter {
path: PathBuf, path: PathBuf,
shared_directory: RamDirectory, shared_directory: RamDirectory,
@@ -39,7 +46,7 @@ impl VecWriter {
impl Drop for VecWriter { impl Drop for VecWriter {
fn drop(&mut self) { fn drop(&mut self) {
if !self.is_flushed { if !self.is_flushed {
warn!( panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.", "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
self.path self.path
) )
@@ -214,8 +221,14 @@ impl Directory for RamDirectory {
} }
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> { fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)));
let path_buf = PathBuf::from(path); let path_buf = PathBuf::from(path);
self.fs.write().unwrap().write(path_buf, data); self.fs.write().unwrap().write(path_buf, data);
if path == *META_FILEPATH { if path == *META_FILEPATH {
let _ = self.fs.write().unwrap().watch_router.broadcast(); let _ = self.fs.write().unwrap().watch_router.broadcast();
} }
@@ -225,10 +238,6 @@ impl Directory for RamDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.fs.write().unwrap().watch(watch_callback)) Ok(self.fs.write().unwrap().watch(watch_callback))
} }
fn sync_directory(&self) -> io::Result<()> {
Ok(())
}
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -118,6 +118,15 @@ mod ram_directory_tests {
} }
} }
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let test_path: &'static Path = Path::new("some_path_for_test");
let ram_directory = RamDirectory::create();
let mut write_file = ram_directory.open_write(test_path).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &dyn Directory) -> crate::Result<()> { fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test"); let test_path: &'static Path = Path::new("some_path_for_test");
let mut write_file = directory.open_write(test_path)?; let mut write_file = directory.open_write(test_path)?;

View File

@@ -1,4 +1,4 @@
use crate::fastfield::AliveBitSet; use crate::fastfield::DeleteBitSet;
use crate::DocId; use crate::DocId;
use std::borrow::Borrow; use std::borrow::Borrow;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
@@ -85,11 +85,11 @@ pub trait DocSet: Send {
/// Returns the number documents matching. /// Returns the number documents matching.
/// Calling this method consumes the `DocSet`. /// Calling this method consumes the `DocSet`.
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let mut count = 0u32; let mut count = 0u32;
let mut doc = self.doc(); let mut doc = self.doc();
while doc != TERMINATED { while doc != TERMINATED {
if alive_bitset.is_alive(doc) { if !delete_bitset.is_deleted(doc) {
count += 1u32; count += 1u32;
} }
doc = self.advance(); doc = self.advance();
@@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet {
(**self).size_hint() (**self).size_hint()
} }
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
(**self).count(alive_bitset) (**self).count(delete_bitset)
} }
fn count_including_deleted(&mut self) -> u32 { fn count_including_deleted(&mut self) -> u32 {
@@ -160,9 +160,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint() unboxed.size_hint()
} }
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut(); let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(alive_bitset) unboxed.count(delete_bitset)
} }
fn count_including_deleted(&mut self) -> u32 { fn count_including_deleted(&mut self) -> u32 {

View File

@@ -1,224 +0,0 @@
use crate::space_usage::ByteCount;
use crate::DocId;
use common::intersect_bitsets;
use common::BitSet;
use common::ReadOnlyBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;
/// Write a alive `BitSet`
///
/// where `alive_bitset` is the set of alive `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
alive_bitset.serialize(writer)?;
Ok(())
}
/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
num_alive_docs: usize,
bitset: ReadOnlyBitSet,
}
/// Intersects two AliveBitSets in a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
let bitset = intersect_bitsets(left.bitset(), right.bitset());
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let mut alive_bitset_buffer = Vec::new();
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
Self::open(alive_bitset_bytes)
}
pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
let readonly_bitset = ReadOnlyBitSet::from(bitset);
AliveBitSet::from(readonly_bitset)
}
/// Opens a delete bitset given its file.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes);
AliveBitSet::from(bitset)
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
#[inline]
pub fn is_alive(&self, doc: DocId) -> bool {
self.bitset.contains(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
!self.is_alive(doc)
}
/// Iterate over the alive doc_ids.
#[inline]
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter()
}
/// Get underlying bitset
#[inline]
pub fn bitset(&self) -> &ReadOnlyBitSet {
&self.bitset
}
/// The number of deleted docs
pub fn num_alive_docs(&self) -> usize {
self.num_alive_docs
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.bitset().num_bytes()
}
}
impl From<ReadOnlyBitSet> for AliveBitSet {
fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
}
#[cfg(test)]
mod tests {
use super::AliveBitSet;
#[test]
fn test_alive_bitset_empty() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
assert!(!alive_bitset.is_deleted(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 10);
}
#[test]
fn test_alive_bitset() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert!(alive_bitset.is_alive(2));
assert!(alive_bitset.is_alive(3));
assert!(alive_bitset.is_alive(4));
assert!(alive_bitset.is_alive(5));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(7));
assert!(alive_bitset.is_alive(8));
assert!(alive_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 8);
}
#[test]
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
#[test]
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::AliveBitSet;
use rand::prelude::IteratorRandom;
use rand::thread_rng;
use test::Bencher;
fn get_alive() -> Vec<u32> {
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
for _ in 0..(1_000_000) * 1 / 8 {
remove_rand(&mut data);
}
data
}
fn remove_rand(raw: &mut Vec<u32>) {
let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
raw.remove(i);
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
}

View File

@@ -18,11 +18,11 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?; index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(bytes_field=>vec![]))?; index_writer.add_document(doc!(bytes_field=>vec![]));
index_writer.add_document(doc!(bytes_field=>vec![255u8]))?; index_writer.add_document(doc!(bytes_field=>vec![255u8]));
index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?; index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?; index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
field => b"tantivy".as_ref(), field => b"tantivy".as_ref(),
field => b"lucene".as_ref() field => b"lucene".as_ref()
))?; ));
index_writer.commit()?; index_writer.commit()?;
Ok(index.reader()?.searcher()) Ok(index.reader()?.searcher())
} }

144
src/fastfield/delete.rs Normal file
View File

@@ -0,0 +1,144 @@
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::HasLen;
use std::io;
use std::io::Write;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut WritePtr,
) -> io::Result<()> {
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
if shift == 7 {
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
} else {
shift += 1;
}
}
if max_doc % 8 > 0 {
writer.write_all(&[byte])?;
}
Ok(())
}
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: OwnedBytes,
num_deleted: usize,
}
impl DeleteBitSet {
    /// Test helper: builds a `DeleteBitSet` with exactly `docs` marked as
    /// deleted, by serializing the bitmap into a RAM directory and reading
    /// it back through `open`.
    #[cfg(test)]
    pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
        use crate::directory::{Directory, RamDirectory, TerminatingWrite};
        use std::path::Path;
        assert!(docs.iter().all(|&doc| doc < max_doc));
        let mut bitset = BitSet::with_max_value(max_doc);
        docs.iter().for_each(|&doc| {
            bitset.insert(doc);
        });
        let directory = RamDirectory::create();
        let path = Path::new("dummydeletebitset");
        let mut wrt = directory.open_write(path).unwrap();
        write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
        wrt.terminate().unwrap();
        Self::open(directory.open_read(path).unwrap()).unwrap()
    }

    /// Opens a delete bitset given its file.
    pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
        let data = file.read_bytes()?;
        // Popcount of the whole bitmap gives the number of deleted docs.
        let num_deleted = data
            .as_slice()
            .iter()
            .fold(0usize, |acc, byte| acc + byte.count_ones() as usize);
        Ok(DeleteBitSet { data, num_deleted })
    }

    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
    pub fn is_alive(&self, doc: DocId) -> bool {
        !self.is_deleted(doc)
    }

    /// Returns true iff the document has been marked as deleted.
    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        // bit `doc % 8` of byte `doc / 8`, matching `write_delete_bitset`.
        let byte = self.data.as_slice()[(doc / 8u32) as usize];
        let mask = 1u8 << ((doc % 8u32) as u8);
        byte & mask != 0
    }

    /// The number of deleted docs
    pub fn num_deleted(&self) -> usize {
        self.num_deleted
    }

    /// Summarize total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.data.len()
    }
}
impl HasLen for DeleteBitSet {
    /// "Length" of a delete bitset is its number of deleted documents.
    fn len(&self) -> usize {
        self.num_deleted()
    }
}
#[cfg(test)]
mod tests {
    use super::DeleteBitSet;
    use common::HasLen;

    // An empty bitset reports every document alive and a zero length.
    #[test]
    fn test_delete_bitset_empty() {
        let delete_bitset = DeleteBitSet::for_test(&[], 10);
        for doc in 0..10 {
            assert!(delete_bitset.is_alive(doc));
            // is_deleted and is_alive must always be exact complements.
            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
        }
        assert_eq!(delete_bitset.len(), 0);
    }

    // Checks every doc exhaustively against the deleted list, instead of the
    // previous hand-enumerated asserts (which duplicated the doc-6 check and
    // were easy to desynchronize from the fixture).
    #[test]
    fn test_delete_bitset() {
        let deleted_docs = [1u32, 9u32];
        let delete_bitset = DeleteBitSet::for_test(&deleted_docs, 10);
        for doc in 0..10 {
            let expect_deleted = deleted_docs.contains(&doc);
            assert_eq!(delete_bitset.is_deleted(doc), expect_deleted);
            assert_eq!(delete_bitset.is_alive(doc), !expect_deleted);
        }
        assert_eq!(delete_bitset.len(), 2);
    }
}

View File

@@ -84,18 +84,18 @@ impl FacetReader {
mod tests { mod tests {
use crate::Index; use crate::Index;
use crate::{ use crate::{
schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}, schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED},
DocAddress, Document, DocAddress, Document,
}; };
#[test] #[test]
fn test_facet_only_indexed() -> crate::Result<()> { fn test_facet_only_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -106,19 +106,42 @@ mod tests {
facet_reader.facet_ords(0u32, &mut facet_ords); facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]); assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::facet); let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None); assert_eq!(value, None);
Ok(()) Ok(())
} }
#[test]
fn test_facet_only_stored() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Some("/a/b".to_string()));
Ok(())
}
#[test] #[test]
fn test_facet_stored_and_indexed() -> crate::Result<()> { fn test_facet_stored_and_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", STORED); let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -129,20 +152,43 @@ mod tests {
facet_reader.facet_ords(0u32, &mut facet_ords); facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]); assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet); let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); assert_eq!(value, Some("/a/b".to_string()));
Ok(())
}
#[test]
fn test_facet_neither_stored_and_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None);
Ok(()) Ok(())
} }
#[test] #[test]
fn test_facet_not_populated_for_all_docs() -> crate::Result<()> { fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -160,12 +206,12 @@ mod tests {
#[test] #[test]
fn test_facet_not_populated_for_any_docs() -> crate::Result<()> { fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher

View File

@@ -23,10 +23,9 @@ values stored.
Read access performance is comparable to that of an array lookup. Read access performance is comparable to that of an array lookup.
*/ */
pub use self::alive_bitset::intersect_alive_bitsets;
pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader; pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
@@ -47,8 +46,8 @@ use crate::{
schema::Type, schema::Type,
}; };
mod alive_bitset;
mod bytes; mod bytes;
mod delete;
mod error; mod error;
mod facet_reader; mod facet_reader;
mod multivalued; mod multivalued;
@@ -110,7 +109,7 @@ impl FastValue for u64 {
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> { fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type { match *field_type {
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(), FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues), FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues),
_ => None, _ => None,
} }
} }
@@ -497,18 +496,18 @@ mod tests {
} }
#[test] #[test]
fn test_merge_missing_date_fast_field() -> crate::Result<()> { fn test_merge_missing_date_fast_field() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST); let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?; index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let segment_ids: Vec<SegmentId> = reader let segment_ids: Vec<SegmentId> = reader
.searcher() .searcher()
.segment_readers() .segment_readers()
@@ -517,10 +516,10 @@ mod tests {
.collect(); .collect();
assert_eq!(segment_ids.len(), 2); assert_eq!(segment_ids.len(), 2);
let merge_future = index_writer.merge(&segment_ids[..]); let merge_future = index_writer.merge(&segment_ids[..]);
futures::executor::block_on(merge_future)?; let merge_res = futures::executor::block_on(merge_future);
reader.reload()?; assert!(merge_res.is_ok());
assert!(reader.reload().is_ok());
assert_eq!(reader.searcher().segment_readers().len(), 1); assert_eq!(reader.searcher().segment_readers().len(), 1);
Ok(())
} }
#[test] #[test]
@@ -529,7 +528,7 @@ mod tests {
} }
#[test] #[test]
fn test_datefastfield() -> crate::Result<()> { fn test_datefastfield() {
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST); let date_field = schema_builder.add_date_field("date", FAST);
@@ -539,22 +538,22 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!( index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()), date_field => crate::DateTime::from_u64(1i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()), multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(3i64.to_u64()) multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(4i64.to_u64()) date_field => crate::DateTime::from_u64(4i64.to_u64())
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()), multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(6i64.to_u64()) multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
))?; ));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
@@ -581,7 +580,6 @@ mod tests {
assert_eq!(dates[0].timestamp(), 5i64); assert_eq!(dates[0].timestamp(), 5i64);
assert_eq!(dates[1].timestamp(), 6i64); assert_eq!(dates[1].timestamp(), 6i64);
} }
Ok(())
} }
} }

View File

@@ -12,9 +12,9 @@ mod tests {
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::Cardinality; use crate::schema::Cardinality;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::FacetOptions;
use crate::schema::IntOptions; use crate::schema::IntOptions;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::INDEXED;
use crate::Document; use crate::Document;
use crate::Index; use crate::Index;
use crate::Term; use crate::Term;
@@ -23,10 +23,10 @@ mod tests {
use proptest::prop_oneof; use proptest::prop_oneof;
use proptest::proptest; use proptest::proptest;
use proptest::strategy::Strategy; use proptest::strategy::Strategy;
use test_log::test; use test_env_log::test;
#[test] #[test]
fn test_multivalued_u64() -> crate::Result<()> { fn test_multivalued_u64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field( let field = schema_builder.add_u64_field(
"multifield", "multifield",
@@ -34,17 +34,17 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64))?; index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64))?; index_writer.add_document(doc!(field=>4u64));
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?; index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().u64s(field)?; let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
{ {
multi_value_reader.get_vals(2, &mut vals); multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]); assert_eq!(&vals, &[4u64]);
@@ -57,55 +57,56 @@ mod tests {
multi_value_reader.get_vals(1, &mut vals); multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty()); assert!(vals.is_empty());
} }
Ok(())
} }
#[test] #[test]
fn test_multivalued_date() -> crate::Result<()> { fn test_multivalued_date() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field( let date_field = schema_builder.add_date_field(
"multi_date_field", "multi_date_field",
IntOptions::default() IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_indexed() .set_indexed()
.set_fieldnorm()
.set_stored(), .set_stored(),
); );
let time_i = let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored()); schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let first_time_stamp = chrono::Utc::now(); let first_time_stamp = chrono::Utc::now();
index_writer.add_document( index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64), doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
)?; );
index_writer.add_document(doc!(time_i=>0i64))?; index_writer.add_document(doc!(time_i=>0i64));
// add one second // add one second
index_writer.add_document( index_writer
doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64), .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
)?;
// add another second // add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2); let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?; index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
// add three seconds // add three seconds
index_writer.add_document( index_writer
doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64), .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
)?; assert!(index_writer.commit().is_ok());
index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 5); assert_eq!(reader.num_docs(), 5);
{ {
let parser = QueryParser::for_index(&index, vec![date_field]); let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?; let query = parser
let results = searcher.search(&query, &TopDocs::with_limit(5))?; .parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
for (_score, doc_address) in results { for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!( assert_eq!(
retrieved_doc retrieved_doc
.get_first(date_field) .get_first(date_field)
@@ -127,8 +128,12 @@ mod tests {
{ {
let parser = QueryParser::for_index(&index, vec![date_field]); let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?; let query = parser
let results = searcher.search(&query, &TopDocs::with_limit(5))?; .parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
@@ -160,8 +165,10 @@ mod tests {
(first_time_stamp + Duration::seconds(1)).to_rfc3339(), (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339() (first_time_stamp + Duration::seconds(3)).to_rfc3339()
); );
let query = parser.parse_query(&range_q)?; let query = parser.parse_query(&range_q).expect("could not parse query");
let results = searcher.search(&query, &TopDocs::with_limit(5))?; let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 2); assert_eq!(results.len(), 2);
for (i, doc_pair) in results.iter().enumerate() { for (i, doc_pair) in results.iter().enumerate() {
@@ -189,16 +196,16 @@ mod tests {
retrieved_doc retrieved_doc
.get_first(time_i) .get_first(time_i)
.expect("cannot find value") .expect("cannot find value")
.i64_value(), .i64_value()
Some(time_i_val) .expect("value not of i64 type"),
time_i_val
); );
} }
} }
Ok(())
} }
#[test] #[test]
fn test_multivalued_i64() -> crate::Result<()> { fn test_multivalued_i64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field( let field = schema_builder.add_i64_field(
"multifield", "multifield",
@@ -206,14 +213,14 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64))?; index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64))?; index_writer.add_document(doc!(field=> -4i64));
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?; index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap(); let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -225,10 +232,9 @@ mod tests {
assert!(vals.is_empty()); assert!(vals.is_empty());
multi_value_reader.get_vals(3, &mut vals); multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]); assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
Ok(())
} }
fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> { fn test_multivalued_no_panic(ops: &[IndexingOp]) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field( let field = schema_builder.add_u64_field(
"multifield", "multifield",
@@ -238,7 +244,7 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops { for &op in ops {
@@ -246,19 +252,19 @@ mod tests {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
match id % 3 { match id % 3 {
0 => { 0 => {
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
} }
1 => { 1 => {
let mut doc = Document::new(); let mut doc = Document::new();
for _ in 0..5001 { for _ in 0..5001 {
doc.add_u64(field, id as u64); doc.add_u64(field, id as u64);
} }
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
_ => { _ => {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_u64(field, id as u64); doc.add_u64(field, id as u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
}; };
} }
@@ -269,16 +275,18 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
IndexingOp::Merge => { IndexingOp::Merge => {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 { if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids))?; block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.segment_updater().wait_merging_thread()?; assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
} }
} }
} }
} }
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// Merging the segments // Merging the segments
{ {
@@ -290,7 +298,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
} }
Ok(())
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
@@ -313,7 +320,7 @@ mod tests {
proptest! { proptest! {
#[test] #[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) { fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_multivalued_no_panic(&ops[..]).is_ok()); test_multivalued_no_panic(&ops[..]);
} }
} }
@@ -328,22 +335,20 @@ mod tests {
Merge, Merge,
]; ];
assert!(test_multivalued_no_panic(&ops[..]).is_ok()); test_multivalued_no_panic(&ops[..]);
} }
#[test] #[test]
#[ignore] #[ignore]
fn test_many_facets() -> crate::Result<()> { fn test_many_facets() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_facet_field("facetfield", FacetOptions::default()); let field = schema_builder.add_facet_field("facetfield", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..100_000 { for i in 0..100_000 {
index_writer index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
} }
index_writer.commit()?; assert!(index_writer.commit().is_ok());
Ok(())
} }
} }

View File

@@ -91,25 +91,27 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests { mod tests {
use crate::core::Index; use crate::core::Index;
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema}; use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
#[test] #[test]
fn test_multifastfield_reader() -> crate::Result<()> { fn test_multifastfield_reader() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"), facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"), facet_field => Facet::from("/category/cat1"),
))?; ));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?; index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?; index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
index_writer.commit()?; index_writer.commit().expect("Commit failed");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field)?; let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet = Facet::root(); let mut facet = Facet::root();
{ {
@@ -143,11 +145,10 @@ mod tests {
facet_reader.facet_ords(2, &mut vals); facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]); assert_eq!(&vals[..], &[4]);
} }
Ok(())
} }
#[test] #[test]
fn test_multifastfield_reader_min_max() -> crate::Result<()> { fn test_multifastfield_reader_min_max() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field_options = IntOptions::default() let field_options = IntOptions::default()
.set_indexed() .set_indexed()
@@ -162,16 +163,15 @@ mod tests {
item_field => 2i64, item_field => 2i64,
item_field => 3i64, item_field => 3i64,
item_field => -2i64, item_field => -2i64,
))?; ));
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?; index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
index_writer.add_document(doc!(item_field => 4i64))?; index_writer.add_document(doc!(item_field => 4i64));
index_writer.commit()?; index_writer.commit().expect("Commit failed");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field)?; let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
assert_eq!(field_reader.min_value(), -2); assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6); assert_eq!(field_reader.max_value(), 6);
Ok(())
} }
} }

View File

@@ -40,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
FieldType::Date(options) => options FieldType::Date(options) => options
.get_fastfield_cardinality() .get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)), .map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)), FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)),
_ => None, _ => None,
} }
} }

View File

@@ -54,7 +54,7 @@ impl FastFieldsWriter {
None => {} None => {}
} }
} }
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true); let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer); multi_values_writers.push(fast_field_writer);
} }

View File

@@ -26,137 +26,3 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter; pub use self::writer::FieldNormsWriter;
use self::code::{fieldnorm_to_id, id_to_fieldnorm}; use self::code::{fieldnorm_to_id, id_to_fieldnorm};
#[cfg(test)]
mod tests {
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::query::Query;
use crate::query::TermQuery;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions;
use crate::schema::TEXT;
use crate::Index;
use crate::Term;
use crate::TERMINATED;
use once_cell::sync::Lazy;
use std::path::Path;
use crate::schema::{Field, Schema, STORED};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("field", STORED);
schema_builder.add_text_field("txt_field", TEXT);
schema_builder.add_text_field(
"str_field",
TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
),
);
schema_builder.build()
});
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());
#[test]
#[should_panic(expected = "Cannot register a given fieldnorm twice")]
pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
}
#[test]
pub fn test_fieldnorm() -> crate::Result<()> {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let serializer = FieldNormsSerializer::from_write(write)?;
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
}
let file = directory.open_read(&path)?;
{
let fields_composite = CompositeFile::open(&file)?;
assert!(fields_composite.open_read(*FIELD).is_none());
assert!(fields_composite.open_read(*STR_FIELD).is_none());
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
let fieldnorm_reader = FieldNormReader::open(data)?;
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
}
Ok(())
}
#[test]
fn test_fieldnorm_disabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_fieldnorm_enabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(true));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.15136132).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}
}

View File

@@ -4,7 +4,6 @@ use super::fieldnorm_to_id;
use super::FieldNormsSerializer; use super::FieldNormsSerializer;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::Schema; use crate::schema::Schema;
use std::cmp::Ordering;
use std::{io, iter}; use std::{io, iter};
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte /// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
@@ -13,7 +12,8 @@ use std::{io, iter};
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a /// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field. /// byte per document per field.
pub struct FieldNormsWriter { pub struct FieldNormsWriter {
fieldnorms_buffers: Vec<Option<Vec<u8>>>, fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>,
} }
impl FieldNormsWriter { impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema schema
.fields() .fields()
.filter_map(|(field, field_entry)| { .filter_map(|(field, field_entry)| {
if field_entry.is_indexed() && field_entry.has_fieldnorms() { if field_entry.is_indexed() {
Some(field) Some(field)
} else { } else {
None None
@@ -35,20 +35,25 @@ impl FieldNormsWriter {
/// Initialize with state for tracking the field norm fields /// Initialize with state for tracking the field norm fields
/// specified in the schema. /// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter { pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None) let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
.take(schema.num_fields()) let max_field = fields
.collect(); .iter()
for field in FieldNormsWriter::fields_with_fieldnorm(schema) { .map(Field::field_id)
fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000)); .max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
FieldNormsWriter {
fields,
fieldnorms_buffer: iter::repeat_with(Vec::new)
.take(max_field)
.collect::<Vec<_>>(),
} }
FieldNormsWriter { fieldnorms_buffers }
} }
/// The memory used inclusive childs /// The memory used inclusive childs
pub fn mem_usage(&self) -> usize { pub fn mem_usage(&self) -> usize {
self.fieldnorms_buffers self.fieldnorms_buffer
.iter() .iter()
.flatten()
.map(|buf| buf.capacity()) .map(|buf| buf.capacity())
.sum() .sum()
} }
@@ -57,10 +62,8 @@ impl FieldNormsWriter {
/// ///
/// Will extend with 0-bytes for documents that have not been seen. /// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) { pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() { for field in self.fields.iter() {
if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() { self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
fieldnorms_buffer.resize(max_doc as usize, 0u8);
}
} }
} }
@@ -73,23 +76,14 @@ impl FieldNormsWriter {
/// * field - the field being set /// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field` /// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) { pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
if let Some(fieldnorm_buffer) = self let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
.fieldnorms_buffers assert!(
.get_mut(field.field_id() as usize) fieldnorm_buffer.len() <= doc as usize,
.and_then(Option::as_mut) "Cannot register a given fieldnorm twice"
{ );
match fieldnorm_buffer.len().cmp(&(doc as usize)) { // we fill intermediary `DocId` as having a fieldnorm of 0.
Ordering::Less => { fieldnorm_buffer.resize(doc as usize + 1, 0u8);
// we fill intermediary `DocId` as having a fieldnorm of 0. fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
fieldnorm_buffer.resize(doc as usize, 0u8);
}
Ordering::Equal => {}
Ordering::Greater => {
panic!("Cannot register a given fieldnorm twice")
}
}
fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
}
} }
/// Serialize the seen fieldnorm values to the serializer for all fields. /// Serialize the seen fieldnorm values to the serializer for all fields.
@@ -98,18 +92,17 @@ impl FieldNormsWriter {
mut fieldnorms_serializer: FieldNormsSerializer, mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map( for &field in self.fields.iter() {
|(field_id, fieldnorms_buffer_opt)| { let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
(Field::from_field_id(field_id as u32), fieldnorms_buffer)
})
},
) {
if let Some(doc_id_map) = doc_id_map { if let Some(doc_id_map) = doc_id_map {
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer); let mut mapped_fieldnorm_values = vec![];
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?; mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
}
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
} else { } else {
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?; fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
} }
} }
fieldnorms_serializer.close()?; fieldnorms_serializer.close()?;

View File

@@ -49,7 +49,7 @@ fn test_functional_store() -> crate::Result<()> {
} }
for _ in 0..num_docs { for _ in 0..num_docs {
doc_set.push(doc_id); doc_set.push(doc_id);
index_writer.add_document(doc!(id_field=>doc_id))?; index_writer.add_document(doc!(id_field=>doc_id));
doc_id += 1; doc_id += 1;
} }
index_writer.commit()?; index_writer.commit()?;
@@ -124,7 +124,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);
} }
doc.add_text(text_field, get_text()); doc.add_text(text_field, get_text());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
} }
Ok(()) Ok(())
@@ -201,7 +201,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);
} }
doc.add_text(text_field, get_text()); doc.add_text(text_field, get_text());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
} }
Ok(()) Ok(())

View File

@@ -1,324 +0,0 @@
use common::BitSet;
use itertools::Itertools;
use crate::fastfield::AliveBitSet;
use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
/// DemuxMapping can be used to reorganize data from multiple segments.
///
/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong to a different tenant.
/// It allows to reorganize documents as follows:
///
/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
/// the documents (simplified)
/// Seg 1 [TENANT_A, TENANT_B]
/// Seg 2 [TENANT_A, TENANT_B]
///
/// You may want to group your documents to
/// Seg 1 [TENANT_A, TENANT_A]
/// Seg 2 [TENANT_B, TENANT_B]
///
/// Demuxing is the tool for that.
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
#[derive(Debug, Default)]
pub struct DemuxMapping {
/// [index old segment ordinal] -> [index doc_id] = new segment ordinal
mapping: Vec<DocIdToSegmentOrdinal>,
}
/// DocIdToSegmentOrdinal maps from doc_id within a segment to the new segment ordinal for demuxing.
///
/// For every source segment there is a `DocIdToSegmentOrdinal` to distribute its doc_ids.
#[derive(Debug, Default)]
pub struct DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: Vec<SegmentOrdinal>,
}
impl DocIdToSegmentOrdinal {
/// Creates a new DocIdToSegmentOrdinal with size of num_doc_ids.
/// Initially all doc_ids point to segment ordinal 0 and need to be set
/// the via `set` method.
pub fn with_max_doc(max_doc: usize) -> Self {
DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: vec![0; max_doc],
}
}
/// Returns the number of documents in this mapping.
/// It should be equal to the `max_doc` of the segment it targets.
pub fn max_doc(&self) -> u32 {
self.doc_id_index_to_segment_ord.len() as u32
}
/// Associates a doc_id with an output `SegmentOrdinal`.
pub fn set(&mut self, doc_id: u32, segment_ord: SegmentOrdinal) {
self.doc_id_index_to_segment_ord[doc_id as usize] = segment_ord;
}
/// Iterates over the new SegmentOrdinal in the order of the doc_id.
pub fn iter(&self) -> impl Iterator<Item = SegmentOrdinal> + '_ {
self.doc_id_index_to_segment_ord.iter().cloned()
}
}
impl DemuxMapping {
/// Adds a DocIdToSegmentOrdinal. The order of the pus calls
/// defines the old segment ordinal. e.g. first push = ordinal 0.
pub fn add(&mut self, segment_mapping: DocIdToSegmentOrdinal) {
self.mapping.push(segment_mapping);
}
/// Returns the old number of segments.
pub fn get_old_num_segments(&self) -> usize {
self.mapping.len()
}
}
fn docs_for_segment_ord(
doc_id_to_segment_ord: &DocIdToSegmentOrdinal,
target_segment_ord: SegmentOrdinal,
) -> AliveBitSet {
let mut bitset = BitSet::with_max_value(doc_id_to_segment_ord.max_doc());
for doc_id in doc_id_to_segment_ord
.iter()
.enumerate()
.filter(|(_doc_id, new_segment_ord)| *new_segment_ord == target_segment_ord)
.map(|(doc_id, _)| doc_id)
{
// add document if segment ordinal = target segment ordinal
bitset.insert(doc_id as u32);
}
AliveBitSet::from_bitset(&bitset)
}
fn get_alive_bitsets(
demux_mapping: &DemuxMapping,
target_segment_ord: SegmentOrdinal,
) -> Vec<AliveBitSet> {
demux_mapping
.mapping
.iter()
.map(|doc_id_to_segment_ord| {
docs_for_segment_ord(doc_id_to_segment_ord, target_segment_ord)
})
.collect_vec()
}
/// Demux the segments according to `demux_mapping`. See `DemuxMapping`.
/// The number of output_directories need to match max new segment ordinal from `demux_mapping`.
///
/// The ordinal of `segments` need to match the ordinals provided in `demux_mapping`.
pub fn demux(
segments: &[Segment],
demux_mapping: &DemuxMapping,
target_settings: IndexSettings,
output_directories: Vec<Box<dyn Directory>>,
) -> crate::Result<Vec<Index>> {
let mut indices = vec![];
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
.into_iter()
.map(Some)
.collect_vec();
let index = merge_filtered_segments(
segments,
target_settings.clone(),
delete_bitsets,
output_directory,
)?;
indices.push(index);
}
Ok(indices)
}
#[cfg(test)]
mod tests {
use crate::{
collector::TopDocs,
directory::RamDirectory,
query::QueryParser,
schema::{Schema, TEXT},
DocAddress, Term,
};
use super::*;
#[test]
fn test_demux_map_to_deletebitset() {
let max_value = 2;
let mut demux_mapping = DemuxMapping::default();
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
{
let bit_sets_for_demuxing_to_segment_ord_0 = get_alive_bitsets(&demux_mapping, 0);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(1),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(1),
true
);
}
{
let bit_sets_for_demuxing_to_segment_ord_1 = get_alive_bitsets(&demux_mapping, 1);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(1),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(1),
false
);
}
}
#[test]
fn test_demux_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto1"))?;
index_writer.add_document(doc!(text_field=>"texto2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto3"))?;
index_writer.add_document(doc!(text_field=>"texto4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let mut demux_mapping = DemuxMapping::default();
{
let max_value = 2;
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
}
assert_eq!(demux_mapping.get_old_num_segments(), 2);
let demuxed_indices = demux(
&segments,
&demux_mapping,
target_settings,
vec![
Box::new(RamDirectory::default()),
Box::new(RamDirectory::default()),
],
)?;
{
let index = &demuxed_indices[0];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![] as Vec<u32>);
assert_eq!(do_search("texto2"), vec![0]);
}
}
{
let index = &demuxed_indices[1];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 3);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![0]);
assert_eq!(do_search("texto2"), vec![] as Vec<u32>);
assert_eq!(do_search("texto3"), vec![1]);
assert_eq!(do_search("texto4"), vec![2]);
}
}
Ok(())
}
}

View File

@@ -2,23 +2,23 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting //! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//! //!
use super::SegmentWriter; use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use crate::{ use crate::{
schema::{Field, Schema}, schema::{Field, Schema},
DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError, DocId, IndexSortByField, Order, TantivyError,
}; };
use std::{cmp::Reverse, ops::Index}; use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to old doc_id and segment. /// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct SegmentDocIdMapping { pub(crate) struct SegmentDocidMapping<'a> {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>, new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool, is_trivial: bool,
} }
impl SegmentDocIdMapping { impl<'a> SegmentDocidMapping<'a> {
pub(crate) fn new( pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>, new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool, is_trivial: bool,
) -> Self { ) -> Self {
Self { Self {
@@ -26,7 +26,7 @@ impl SegmentDocIdMapping {
is_trivial, is_trivial,
} }
} }
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentOrdinal)> { pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
self.new_doc_id_to_old_and_segment.iter() self.new_doc_id_to_old_and_segment.iter()
} }
pub(crate) fn len(&self) -> usize { pub(crate) fn len(&self) -> usize {
@@ -40,15 +40,15 @@ impl SegmentDocIdMapping {
self.is_trivial self.is_trivial
} }
} }
impl Index<usize> for SegmentDocIdMapping { impl<'a> Index<usize> for SegmentDocidMapping<'a> {
type Output = (DocId, SegmentOrdinal); type Output = (DocId, SegmentReaderWithOrdinal<'a>);
fn index(&self, idx: usize) -> &Self::Output { fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx] &self.new_doc_id_to_old_and_segment[idx]
} }
} }
impl IntoIterator for SegmentDocIdMapping { impl<'a> IntoIterator for SegmentDocidMapping<'a> {
type Item = (DocId, SegmentOrdinal); type Item = (DocId, SegmentReaderWithOrdinal<'a>);
type IntoIter = std::vec::IntoIter<Self::Item>; type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter { fn into_iter(self) -> Self::IntoIter {
@@ -63,24 +63,6 @@ pub struct DocIdMapping {
} }
impl DocIdMapping { impl DocIdMapping {
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
.iter()
.cloned()
.max()
.map(|n| n + 1)
.unwrap_or(0);
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
}
}
/// returns the new doc_id for the old doc_id /// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId { pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize] self.old_doc_id_to_new[doc_id as usize]
@@ -93,13 +75,6 @@ impl DocIdMapping {
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ { pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned() self.new_doc_id_to_old.iter().cloned()
} }
/// Remaps a given array to the new doc ids.
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
self.new_doc_id_to_old
.iter()
.map(|old_doc| els[*old_doc as usize])
.collect()
}
} }
pub(crate) fn expect_field_id_for_sort_field( pub(crate) fn expect_field_id_for_sort_field(
@@ -147,13 +122,23 @@ pub(crate) fn get_doc_id_mapping_from_field(
.into_iter() .into_iter()
.map(|el| el.0) .map(|el| el.0)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
// create old doc_id to new doc_id index (used in posting recorder)
let max_doc = new_doc_id_to_old.len();
let mut old_doc_id_to_new = vec![0; max_doc];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
let doc_id_map = DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
};
Ok(doc_id_map)
} }
#[cfg(test)] #[cfg(test)]
mod tests_indexsorting { mod tests_indexsorting {
use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldReader;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::{collector::TopDocs, query::QueryParser, schema::*}; use crate::{collector::TopDocs, query::QueryParser, schema::*};
use crate::{schema::Schema, DocAddress}; use crate::{schema::Schema, DocAddress};
use crate::{Index, IndexSettings, IndexSortByField, Order}; use crate::{Index, IndexSettings, IndexSortByField, Order};
@@ -161,7 +146,7 @@ mod tests_indexsorting {
fn create_test_index( fn create_test_index(
index_settings: Option<IndexSettings>, index_settings: Option<IndexSettings>,
text_field_options: TextOptions, text_field_options: TextOptions,
) -> crate::Result<Index> { ) -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let my_text_field = schema_builder.add_text_field("text_field", text_field_options); let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
@@ -181,20 +166,19 @@ mod tests_indexsorting {
if let Some(settings) = index_settings { if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings); index_builder = index_builder.settings(settings);
} }
let index = index_builder.create_in_ram()?; let index = index_builder.create_in_ram().unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(my_number=>40_u64))?; index_writer.add_document(doc!(my_number=>40_u64));
index_writer.add_document( index_writer
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64), .add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
)?; index_writer.add_document(doc!(my_number=>100_u64));
index_writer.add_document(doc!(my_number=>100_u64))?;
index_writer.add_document( index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"), doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
)?; );
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?; index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
index_writer.commit()?; index_writer.commit().unwrap();
Ok(index) index
} }
fn get_text_options() -> TextOptions { fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options( TextOptions::default().set_indexing_options(
@@ -219,7 +203,7 @@ mod tests_indexsorting {
for option in options { for option in options {
//let options = get_text_options(); //let options = get_text_options();
// no index_sort // no index_sort
let index = create_test_index(None, option.clone())?; let index = create_test_index(None, option.clone());
let my_text_field = index.schema().get_field("text_field").unwrap(); let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -241,7 +225,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
option.clone(), option.clone(),
)?; );
let my_text_field = index.schema().get_field("text_field").unwrap(); let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -273,7 +257,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
option.clone(), option.clone(),
)?; );
let my_string_field = index.schema().get_field("text_field").unwrap(); let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -303,7 +287,7 @@ mod tests_indexsorting {
#[test] #[test]
fn test_sort_index_get_documents() -> crate::Result<()> { fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline // default baseline
let index = create_test_index(None, get_text_options())?; let index = create_test_index(None, get_text_options());
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -332,7 +316,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -357,7 +341,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -372,7 +356,7 @@ mod tests_indexsorting {
#[test] #[test]
fn test_sort_index_test_string_field() -> crate::Result<()> { fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options())?; let index = create_test_index(None, get_text_options());
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -392,7 +376,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -423,7 +407,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -459,7 +443,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
assert_eq!( assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field, index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string() "my_number".to_string()
@@ -490,27 +474,4 @@ mod tests_indexsorting {
assert_eq!(vals, &[3]); assert_eq!(vals, &[3]);
Ok(()) Ok(())
} }
#[test]
fn test_doc_mapping() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
}
#[test]
fn test_doc_mapping_remap() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
assert_eq!(
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
&[2000, 8000, 3000]
);
}
} }

View File

@@ -11,10 +11,9 @@ use crate::directory::TerminatingWrite;
use crate::directory::{DirectoryLock, GarbageCollectionResult}; use crate::directory::{DirectoryLock, GarbageCollectionResult};
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::error::TantivyError; use crate::error::TantivyError;
use crate::fastfield::write_alive_bitset; use crate::fastfield::write_delete_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue}; use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::index_writer_status::IndexWriterStatus;
use crate::indexer::operation::DeleteOperation; use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper; use crate::indexer::stamper::Stamper;
use crate::indexer::MergePolicy; use crate::indexer::MergePolicy;
@@ -29,13 +28,13 @@ use crossbeam::channel;
use futures::executor::block_on; use futures::executor::block_on;
use futures::future::Future; use futures::future::Future;
use smallvec::smallvec; use smallvec::smallvec;
use smallvec::SmallVec;
use std::mem;
use std::ops::Range; use std::ops::Range;
use std::sync::Arc; use std::sync::Arc;
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
use super::{AddBatch, AddBatchReceiver, AddBatchSender};
// Size of the margin for the heap. A segment is closed when the remaining memory // Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES. // in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000; pub const MARGIN_IN_BYTES: usize = 1_000_000;
@@ -51,12 +50,15 @@ pub const MAX_NUM_THREAD: usize = 8;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS` // reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
fn error_in_index_worker_thread(context: &str) -> TantivyError { // Group of operations.
TantivyError::ErrorInThread(format!( // Most of the time, users will send operation one-by-one, but it can be useful to
"{}. A worker thread encounterred an error (io::Error most likely) or panicked.", // send them as a small block to ensure that
context // - all docs in the operation will happen on the same segment and continuous doc_ids.
)) // - all operations in the group are committed at the same time, making the group
} // atomic.
type OperationGroup = SmallVec<[AddOperation; 4]>;
type OperationSender = channel::Sender<OperationGroup>;
type OperationReceiver = channel::Receiver<OperationGroup>;
/// `IndexWriter` is the user entry-point to add document to an index. /// `IndexWriter` is the user entry-point to add document to an index.
/// ///
@@ -75,8 +77,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>, workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
index_writer_status: IndexWriterStatus, operation_receiver: OperationReceiver,
operation_sender: AddBatchSender, operation_sender: OperationSender,
segment_updater: SegmentUpdater, segment_updater: SegmentUpdater,
@@ -91,7 +93,7 @@ pub struct IndexWriter {
} }
fn compute_deleted_bitset( fn compute_deleted_bitset(
alive_bitset: &mut BitSet, delete_bitset: &mut BitSet,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor, delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping, doc_opstamps: &DocToOpstampMapping,
@@ -112,7 +114,7 @@ fn compute_deleted_bitset(
let mut doc_matching_deleted_term = docset.doc(); let mut doc_matching_deleted_term = docset.doc();
while doc_matching_deleted_term != TERMINATED { while doc_matching_deleted_term != TERMINATED {
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) { if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
alive_bitset.remove(doc_matching_deleted_term); delete_bitset.insert(doc_matching_deleted_term);
might_have_changed = true; might_have_changed = true;
} }
doc_matching_deleted_term = docset.advance(); doc_matching_deleted_term = docset.advance();
@@ -139,7 +141,7 @@ pub(crate) fn advance_deletes(
return Ok(()); return Ok(());
} }
if segment_entry.alive_bitset().is_none() && segment_entry.delete_cursor().get().is_none() { if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
// There has been no `DeleteOperation` between the segment status and `target_opstamp`. // There has been no `DeleteOperation` between the segment status and `target_opstamp`.
return Ok(()); return Ok(());
} }
@@ -147,32 +149,38 @@ pub(crate) fn advance_deletes(
let segment_reader = SegmentReader::open(&segment)?; let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc(); let max_doc = segment_reader.max_doc();
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
Some(previous_alive_bitset) => (*previous_alive_bitset).clone(), Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
None => BitSet::with_max_value_and_full(max_doc), None => BitSet::with_max_value(max_doc),
}; };
let num_deleted_docs_before = segment.meta().num_deleted_docs(); let num_deleted_docs_before = segment.meta().num_deleted_docs();
compute_deleted_bitset( compute_deleted_bitset(
&mut alive_bitset, &mut delete_bitset,
&segment_reader, &segment_reader,
segment_entry.delete_cursor(), segment_entry.delete_cursor(),
&DocToOpstampMapping::None, &DocToOpstampMapping::None,
target_opstamp, target_opstamp,
)?; )?;
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() { // TODO optimize
alive_bitset.intersect_update(seg_alive_bitset.bitset()); // It should be possible to do something smarter by manipulation bitsets directly
// to compute this union.
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
for doc in 0u32..max_doc {
if seg_delete_bitset.is_deleted(doc) {
delete_bitset.insert(doc);
}
}
} }
let num_alive_docs: u32 = alive_bitset.len() as u32; let num_deleted_docs: u32 = delete_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
if num_deleted_docs > num_deleted_docs_before { if num_deleted_docs > num_deleted_docs_before {
// There are new deletes. We need to write a new delete file. // There are new deletes. We need to write a new delete file.
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::Delete)?; let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
write_alive_bitset(&alive_bitset, &mut delete_file)?; write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
delete_file.terminate()?; delete_file.terminate()?;
} }
@@ -183,10 +191,10 @@ pub(crate) fn advance_deletes(
fn index_documents( fn index_documents(
memory_budget: usize, memory_budget: usize,
segment: Segment, segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>, grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater, segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> crate::Result<()> { ) -> crate::Result<bool> {
let schema = segment.schema(); let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?; let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
@@ -205,7 +213,7 @@ fn index_documents(
} }
if !segment_updater.is_alive() { if !segment_updater.is_alive() {
return Ok(()); return Ok(false);
} }
let max_doc = segment_writer.max_doc(); let max_doc = segment_writer.max_doc();
@@ -218,20 +226,21 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc); let segment_with_max_doc = segment.with_max_doc(max_doc);
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?; let delete_bitset_opt =
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone(); let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore(); meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore // update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt); let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt);
block_on(segment_updater.schedule_add_segment(segment_entry))?; block_on(segment_updater.schedule_add_segment(segment_entry))?;
Ok(()) Ok(true)
} }
/// `doc_opstamps` is required to be non-empty. /// `doc_opstamps` is required to be non-empty.
fn apply_deletes( fn apply_deletes(
segment: &Segment, segment: &Segment,
delete_cursor: &mut DeleteCursor, mut delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp], doc_opstamps: &[Opstamp],
) -> crate::Result<Option<BitSet>> { ) -> crate::Result<Option<BitSet>> {
if delete_cursor.get().is_none() { if delete_cursor.get().is_none() {
@@ -250,11 +259,11 @@ fn apply_deletes(
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc(); let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc); let mut deleted_bitset = BitSet::with_max_value(max_doc);
let may_have_deletes = compute_deleted_bitset( let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset, &mut deleted_bitset,
&segment_reader, &segment_reader,
delete_cursor, &mut delete_cursor,
&doc_to_opstamps, &doc_to_opstamps,
max_doc_opstamp, max_doc_opstamp,
)?; )?;
@@ -278,7 +287,8 @@ impl IndexWriter {
/// should work at the same time. /// should work at the same time.
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` /// # Panics
/// If the heap size per thread is too small, panics.
pub(crate) fn new( pub(crate) fn new(
index: &Index, index: &Index,
num_threads: usize, num_threads: usize,
@@ -296,7 +306,7 @@ impl IndexWriter {
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX); let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg)); return Err(TantivyError::InvalidArgument(err_msg));
} }
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) = let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new(); let delete_queue = DeleteQueue::new();
@@ -314,7 +324,7 @@ impl IndexWriter {
heap_size_in_bytes_per_thread, heap_size_in_bytes_per_thread,
index: index.clone(), index: index.clone(),
index_writer_status: IndexWriterStatus::from(document_receiver), operation_receiver: document_receiver,
operation_sender: document_sender, operation_sender: document_sender,
segment_updater, segment_updater,
@@ -338,11 +348,6 @@ impl IndexWriter {
self.operation_sender = sender; self.operation_sender = sender;
} }
/// Accessor to the index.
pub fn index(&self) -> &Index {
&self.index
}
/// If there are some merging threads, blocks until they all finish their work and /// If there are some merging threads, blocks until they all finish their work and
/// then drop the `IndexWriter`. /// then drop the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> crate::Result<()> { pub fn wait_merging_threads(mut self) -> crate::Result<()> {
@@ -354,14 +359,16 @@ impl IndexWriter {
for join_handle in former_workers_handles { for join_handle in former_workers_handles {
join_handle join_handle
.join() .join()
.map_err(|_| error_in_index_worker_thread("Worker thread panicked."))? .expect("Indexing Worker thread panicked")
.map_err(|_| error_in_index_worker_thread("Worker thread failed."))?; .map_err(|_| {
TantivyError::ErrorInThread("Error in indexing worker thread.".into())
})?;
} }
let result = self let result = self
.segment_updater .segment_updater
.wait_merging_thread() .wait_merging_thread()
.map_err(|_| error_in_index_worker_thread("Failed to join merging thread.")); .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
if let Err(ref e) = result { if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e); error!("Some merging thread failed {:?}", e);
@@ -389,18 +396,10 @@ impl IndexWriter {
self.index.new_segment() self.index.new_segment()
} }
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encounterred an Io error for instance.".to_string()))
}
/// Spawns a new worker thread for indexing. /// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline. /// The thread consumes documents from the pipeline.
fn add_indexing_worker(&mut self) -> crate::Result<()> { fn add_indexing_worker(&mut self) -> crate::Result<()> {
let document_receiver_clone = self.operation_receiver()?; let document_receiver_clone = self.operation_receiver.clone();
let index_writer_bomb = self.index_writer_status.create_bomb();
let mut segment_updater = self.segment_updater.clone(); let mut segment_updater = self.segment_updater.clone();
let mut delete_cursor = self.delete_queue.cursor(); let mut delete_cursor = self.delete_queue.cursor();
@@ -411,31 +410,32 @@ impl IndexWriter {
.name(format!("thrd-tantivy-index{}", self.worker_id)) .name(format!("thrd-tantivy-index{}", self.worker_id))
.spawn(move || { .spawn(move || {
loop { loop {
let mut document_iterator = document_receiver_clone let mut document_iterator =
.clone() document_receiver_clone.clone().into_iter().peekable();
.into_iter()
.filter(|batch| !batch.is_empty())
.peekable();
// The peeking here is to avoid creating a new segment's files // the peeking here is to avoid
// creating a new segment's files
// if no document are available. // if no document are available.
// //
// This is a valid guarantee as the peeked document now belongs to // this is a valid guarantee as the
// peeked document now belongs to
// our local iterator. // our local iterator.
if let Some(batch) = document_iterator.peek() { if let Some(operations) = document_iterator.peek() {
assert!(!batch.is_empty()); if let Some(first) = operations.first() {
delete_cursor.skip_to(batch[0].opstamp); delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
} else { } else {
// No more documents. // No more documents.
// It happens when there is a commit, or if the `IndexWriter` // Happens when there is a commit, or if the `IndexWriter`
// was dropped. // was dropped.
index_writer_bomb.defuse();
return Ok(()); return Ok(());
} }
let segment = index.new_segment();
index_documents( index_documents(
mem_budget, mem_budget,
index.new_segment(), segment,
&mut document_iterator, &mut document_iterator,
&mut segment_updater, &mut segment_updater,
delete_cursor.clone(), delete_cursor.clone(),
@@ -465,8 +465,10 @@ impl IndexWriter {
} }
/// Detects and removes the files that are not used by the index anymore. /// Detects and removes the files that are not used by the index anymore.
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> { pub fn garbage_collect_files(
self.segment_updater.schedule_garbage_collect().await &self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
self.segment_updater.schedule_garbage_collect()
} }
/// Deletes all documents from the index /// Deletes all documents from the index
@@ -489,7 +491,7 @@ impl IndexWriter {
/// let index = Index::create_in_ram(schema.clone()); /// let index = Index::create_in_ram(schema.clone());
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?; /// index_writer.add_document(doc!(title => "The modern Promotheus"));
/// index_writer.commit()?; /// index_writer.commit()?;
/// ///
/// let clear_res = index_writer.delete_all_documents().unwrap(); /// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -533,11 +535,12 @@ impl IndexWriter {
/// when no documents are remaining. /// when no documents are remaining.
/// ///
/// Returns the former segment_ready channel. /// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) { #[allow(unused_must_use)]
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) = fn recreate_document_channel(&mut self) -> OperationReceiver {
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender; mem::replace(&mut self.operation_sender, document_sender);
self.index_writer_status = IndexWriterStatus::from(document_receiver); mem::replace(&mut self.operation_receiver, document_receiver)
} }
/// Rollback to the last commit /// Rollback to the last commit
@@ -553,7 +556,7 @@ impl IndexWriter {
// marks the segment updater as killed. From now on, all // marks the segment updater as killed. From now on, all
// segment updates will be ignored. // segment updates will be ignored.
self.segment_updater.kill(); self.segment_updater.kill();
let document_receiver_res = self.operation_receiver(); let document_receiver = self.operation_receiver.clone();
// take the directory lock to create a new index_writer. // take the directory lock to create a new index_writer.
let directory_lock = self let directory_lock = self
@@ -579,9 +582,7 @@ impl IndexWriter {
// //
// This will reach an end as the only document_sender // This will reach an end as the only document_sender
// was dropped with the index_writer. // was dropped with the index_writer.
if let Ok(document_receiver) = document_receiver_res { for _ in document_receiver {}
for _ in document_receiver {}
}
Ok(self.committed_opstamp) Ok(self.committed_opstamp)
} }
@@ -695,10 +696,14 @@ impl IndexWriter {
/// The opstamp is an increasing `u64` that can /// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own /// be used by the client to align commits with its own
/// document queue. /// document queue.
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> { pub fn add_document(&self, document: Document) -> Opstamp {
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?; let add_operation = AddOperation { opstamp, document };
Ok(opstamp) let send_result = self.operation_sender.send(smallvec![add_operation]);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
} }
/// Gets a range of stamps from the stamper and "pops" the last stamp /// Gets a range of stamps from the stamper and "pops" the last stamp
@@ -711,7 +716,11 @@ impl IndexWriter {
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) { fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
let Range { start, end } = self.stamper.stamps(count + 1u64); let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1; let last_opstamp = end - 1;
(last_opstamp, start..last_opstamp) let stamps = Range {
start,
end: last_opstamp,
};
(last_opstamp, stamps)
} }
/// Runs a group of document operations ensuring that the operations are /// Runs a group of document operations ensuring that the operations are
@@ -730,20 +739,16 @@ impl IndexWriter {
/// Like adds and deletes (see `IndexWriter.add_document` and /// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be /// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`. /// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp> pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
where let count = user_operations.len() as u64;
I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
let count = user_operations_it.len() as u64;
if count == 0 { if count == 0 {
return Ok(self.stamper.stamp()); return self.stamper.stamp();
} }
let (batch_opstamp, stamps) = self.get_batch_opstamps(count); let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
let mut adds = AddBatch::default(); let mut adds = OperationGroup::default();
for (user_op, opstamp) in user_operations_it.zip(stamps) {
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
match user_op { match user_op {
UserOperation::Delete(term) => { UserOperation::Delete(term) => {
let delete_operation = DeleteOperation { opstamp, term }; let delete_operation = DeleteOperation { opstamp, term };
@@ -755,16 +760,12 @@ impl IndexWriter {
} }
} }
} }
self.send_add_documents_batch(adds)?; let send_result = self.operation_sender.send(adds);
Ok(batch_opstamp) if let Err(e) = send_result {
} panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
};
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> { batch_opstamp
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
Err(error_in_index_worker_thread("An index writer was killed."))
}
} }
} }
@@ -798,7 +799,6 @@ mod tests {
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::Cardinality; use crate::schema::Cardinality;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::FacetOptions;
use crate::schema::IntOptions; use crate::schema::IntOptions;
use crate::schema::TextFieldIndexing; use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions; use crate::schema::TextOptions;
@@ -831,7 +831,7 @@ mod tests {
UserOperation::Add(doc!(text_field=>"a")), UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")), UserOperation::Add(doc!(text_field=>"b")),
]; ];
let batch_opstamp1 = index_writer.run(operations).unwrap(); let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64); assert_eq!(batch_opstamp1, 2u64);
} }
@@ -842,18 +842,14 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer.add_document(doc!(text_field => "hello1"));
.add_document(doc!(text_field => "hello1")) index_writer.add_document(doc!(text_field => "hello2"));
.unwrap();
index_writer
.add_document(doc!(text_field => "hello2"))
.unwrap();
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 2); assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
index_writer.delete_term(Term::from_field_text(text_field, "hello1")); index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
@@ -861,7 +857,7 @@ mod tests {
assert!(reader.reload().is_ok()); assert!(reader.reload().is_ok());
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1); assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp(); let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
@@ -873,7 +869,7 @@ mod tests {
assert!(reader.reload().is_ok()); assert!(reader.reload().is_ok());
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1); assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp(); let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
assert_eq!(after_delete_opstamp, previous_delete_opstamp); assert_eq!(after_delete_opstamp, previous_delete_opstamp);
@@ -904,7 +900,7 @@ mod tests {
UserOperation::Delete(b_term), UserOperation::Delete(b_term),
]; ];
index_writer.run(operations).unwrap(); index_writer.run(operations);
index_writer.commit().expect("failed to commit"); index_writer.commit().expect("failed to commit");
reader.reload().expect("failed to load searchers"); reader.reload().expect("failed to load searchers");
@@ -934,10 +930,10 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap(); let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![]; let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap(); let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64); assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![]; let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2).unwrap(); let batch_opstamp2 = index_writer.run(operations2);
assert_eq!(batch_opstamp2, 1u64); assert_eq!(batch_opstamp2, 1u64);
} }
@@ -974,7 +970,7 @@ mod tests {
assert_eq!( assert_eq!(
format!("{:?}", index_writer.get_merge_policy()), format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \ "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }" level_log_size: 0.75 }"
); );
let merge_policy = Box::new(NoMergePolicy::default()); let merge_policy = Box::new(NoMergePolicy::default());
index_writer.set_merge_policy(merge_policy); index_writer.set_merge_policy(merge_policy);
@@ -997,14 +993,15 @@ mod tests {
} }
#[test] #[test]
fn test_commit_and_rollback() -> crate::Result<()> { fn test_commit_and_rollback() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s); let term = Term::from_field_text(text_field, s);
@@ -1013,127 +1010,136 @@ mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer(3_000_000)?; let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback()?; index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64); assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
index_writer.add_document(doc!(text_field=>"b"))?; {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"b"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"c"));
reader.reload()?; }
assert!(index_writer.commit().is_ok());
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1); assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1); assert_eq!(num_docs_containing("c"), 1);
} }
reader.reload()?; reader.reload().unwrap();
reader.searcher(); reader.searcher();
Ok(())
} }
#[test] #[test]
fn test_with_merges() -> crate::Result<()> { fn test_with_merges() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a).unwrap() reader.searcher().doc_freq(&term_a).unwrap()
}; };
// writing the segment {
let mut index_writer = index.writer(12_000_000).unwrap(); // writing the segment
// create 8 segments with 100 tiny docs let mut index_writer = index.writer(12_000_000).unwrap();
for _doc in 0..100 { // create 8 segments with 100 tiny docs
index_writer.add_document(doc!(text_field=>"a"))?; for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
} }
index_writer.commit()?;
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?;
}
// this should create 8 segments and trigger a merge.
index_writer.commit()?;
index_writer.wait_merging_threads()?;
reader.reload()?;
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments()?.len() < 8);
Ok(())
} }
#[test] #[test]
fn test_prepare_with_commit_message() -> crate::Result<()> { fn test_prepare_with_commit_message() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer(12_000_000)?;
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
{
let mut prepared_commit = index_writer.prepare_commit()?;
prepared_commit.set_payload("first commit");
prepared_commit.commit()?;
}
{
let metas = index.load_metas()?;
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
index_writer.commit()?;
{
let metas = index.load_metas()?;
assert!(metas.payload.is_none());
}
Ok(())
}
#[test]
fn test_prepare_but_rollback() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?; let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs // create 8 segments with 100 tiny docs
for _doc in 0..100 { for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?; index_writer.add_document(doc!(text_field => "a"));
} }
{ {
let mut prepared_commit = index_writer.prepare_commit()?; let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
prepared_commit.abort()?; prepared_commit.commit().expect("commit failed");
} }
{ {
let metas = index.load_metas()?; let metas = index.load_metas().unwrap();
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();
{
let metas = index.load_metas().unwrap();
assert!(metas.payload.is_none());
}
}
}
#[test]
fn test_prepare_but_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
prepared_commit.abort().expect("commit failed");
}
{
let metas = index.load_metas().unwrap();
assert!(metas.payload.is_none()); assert!(metas.payload.is_none());
} }
for _doc in 0..100 { for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "b"))?; index_writer.add_document(doc!(text_field => "b"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
} }
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
index index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()? .try_into()
.unwrap()
.searcher() .searcher()
.doc_freq(&term_a) .doc_freq(&term_a)
.unwrap()
}; };
assert_eq!(num_docs_containing("a")?, 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b")?, 100); assert_eq!(num_docs_containing("b"), 100);
Ok(())
} }
#[test] #[test]
@@ -1154,7 +1160,7 @@ mod tests {
}; };
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap(); let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let commit_tstamp = index_writer.commit().unwrap(); let commit_tstamp = index_writer.commit().unwrap();
assert!(commit_tstamp > add_tstamp); assert!(commit_tstamp > add_tstamp);
index_writer.delete_all_documents().unwrap(); index_writer.delete_all_documents().unwrap();
@@ -1171,7 +1177,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap(); let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
// commit documents - they are now available // commit documents - they are now available
let first_commit = index_writer.commit(); let first_commit = index_writer.commit();
@@ -1190,7 +1196,7 @@ mod tests {
// add new documents again // add new documents again
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b")).unwrap(); index_writer.add_document(doc!(text_field => "b"));
} }
// rollback to last commit, when index was empty // rollback to last commit, when index was empty
@@ -1224,7 +1230,7 @@ mod tests {
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
// add one simple doc // add one simple doc
index_writer.add_document(doc!(text_field => "a")).unwrap(); index_writer.add_document(doc!(text_field => "a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
@@ -1248,7 +1254,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc // add one simple doc
assert!(index_writer.add_document(doc!(text_field => "a")).is_ok()); index_writer.add_document(doc!(text_field => "a"));
let comm = index_writer.commit(); let comm = index_writer.commit();
assert!(comm.is_ok()); assert!(comm.is_ok());
let commit_tstamp = comm.unwrap(); let commit_tstamp = comm.unwrap();
@@ -1324,13 +1330,13 @@ mod tests {
// create and delete docs in same commit // create and delete docs in same commit
for id in 0u64..5u64 { for id in 0u64..5u64 {
index_writer.add_document(doc!(id_field => id))?; index_writer.add_document(doc!(id_field => id));
} }
for id in 2u64..4u64 { for id in 2u64..4u64 {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
} }
for id in 5u64..10u64 { for id in 5u64..10u64 {
index_writer.add_document(doc!(id_field => id))?; index_writer.add_document(doc!(id_field => id));
} }
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
@@ -1358,24 +1364,15 @@ mod tests {
Merge, Merge,
} }
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> { fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![ prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }), (0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }), (0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit), (0u64..2u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge), (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
} }
fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) { fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new(); let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new(); let mut deleted_ids = HashSet::new();
@@ -1420,7 +1417,7 @@ mod tests {
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_stored(), .set_stored(),
); );
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let settings = if sort_index { let settings = if sort_index {
IndexSettings { IndexSettings {
@@ -1442,14 +1439,12 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
let old_reader = index.reader()?;
for &op in ops { for &op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
index_writer index_writer
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM))?; .add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1482,21 +1477,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
} }
old_reader.reload()?;
let old_searcher = old_reader.searcher();
let ids_old_searcher: HashSet<u64> = old_searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
})
.collect();
let ids: HashSet<u64> = searcher let ids: HashSet<u64> = searcher
.segment_readers() .segment_readers()
.iter() .iter()
@@ -1509,19 +1489,6 @@ mod tests {
.collect(); .collect();
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops); let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
let num_docs_expected = expected_ids_and_num_occurences
.iter()
.map(|(_, id_occurences)| *id_occurences as usize)
.sum::<usize>();
assert_eq!(searcher.num_docs() as usize, num_docs_expected);
assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
assert_eq!(
ids_old_searcher,
expected_ids_and_num_occurences
.keys()
.cloned()
.collect::<HashSet<_>>()
);
assert_eq!( assert_eq!(
ids, ids,
expected_ids_and_num_occurences expected_ids_and_num_occurences
@@ -1546,7 +1513,7 @@ mod tests {
for segment_reader in searcher.segment_readers().iter() { for segment_reader in searcher.segment_readers().iter() {
let store_reader = segment_reader.get_store_reader().unwrap(); let store_reader = segment_reader.get_store_reader().unwrap();
// test store iterator // test store iterator
for doc in store_reader.iter(segment_reader.alive_bitset()) { for doc in store_reader.iter(segment_reader.delete_bitset()) {
let id = doc let id = doc
.unwrap() .unwrap()
.get_first(id_field) .get_first(id_field)
@@ -1616,42 +1583,22 @@ mod tests {
} }
proptest! { proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test] #[test]
fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok()); assert!(test_operation_strategy(&ops[..], true, false).is_ok());
} }
#[test] #[test]
fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok()); assert!(test_operation_strategy(&ops[..], false, false).is_ok());
} }
#[test] #[test]
fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok()); assert!(test_operation_strategy(&ops[..], true, true).is_ok());
} }
#[test] #[test]
fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok()); assert!(test_operation_strategy(&ops[..], false, true).is_ok());
} }
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
} }
#[test] #[test]
@@ -1676,11 +1623,11 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// We add a doc... // We add a doc...
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?; index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
// And remove it. // And remove it.
index_writer.delete_term(Term::from_field_u64(id_field, 0u64)); index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
// We add another doc. // We add another doc.
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?; index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
// The expected result is a segment with // The expected result is a segment with
// maxdoc = 2 // maxdoc = 2
@@ -1692,19 +1639,19 @@ mod tests {
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2); assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_docs(), 1); assert_eq!(segment_reader.num_deleted_docs(), 1);
Ok(()) Ok(())
} }
#[test] #[test]
fn test_index_doc_missing_field() -> crate::Result<()> { fn test_index_doc_missing_field() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let idfield = schema_builder.add_text_field("id", STRING); let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING); schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"))?; index_writer.add_document(doc!(idfield=>"myid"));
index_writer.commit()?; let commit = index_writer.commit();
Ok(()) assert!(commit.is_ok());
} }
} }

View File

@@ -1,118 +0,0 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
}
impl IndexWriterStatus {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
}
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns None.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self
.inner
.receive_channel
.read()
.expect("This lock should never be poisoned");
rlock.as_ref().cloned()
}
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver>>,
}
impl Inner {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
fn kill(&self) {
self.is_alive.store(false, Ordering::Relaxed);
self.receive_channel
.write()
.expect("This lock should never be poisoned")
.take();
}
}
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
receive_channel: RwLock::new(Some(receiver)),
}),
}
}
}
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
}
impl IndexWriterBomb {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
/// the index writer.
pub fn defuse(mut self) {
self.inner = None;
}
}
impl Drop for IndexWriterBomb {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();
}
}
}
#[cfg(test)]
mod tests {
use super::IndexWriterStatus;
use crossbeam::channel;
use std::mem;
#[test]
fn test_bomb_goes_boom() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
assert!(index_writer_status.operation_receiver().is_some());
mem::drop(bomb);
// boom!
assert!(index_writer_status.operation_receiver().is_none());
}
#[test]
fn test_bomb_defused() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
bomb.defuse();
assert!(index_writer_status.operation_receiver().is_some());
}
}

View File

@@ -2,15 +2,12 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
use crate::core::SegmentMeta; use crate::core::SegmentMeta;
use itertools::Itertools; use itertools::Itertools;
use std::cmp; use std::cmp;
use std::f64;
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75; const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000; const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8; const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000; const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
// The default value of 1 means that deletes are not taken in account when
// identifying merge candidates. This is not a very sensible default: it was
// set like that for backward compatibility and might change in the near future.
const DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE: f32 = 1.0f32;
/// `LogMergePolicy` tries to merge segments that have a similar number of /// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents. /// documents.
@@ -20,7 +17,6 @@ pub struct LogMergePolicy {
max_docs_before_merge: usize, max_docs_before_merge: usize,
min_layer_size: u32, min_layer_size: u32,
level_log_size: f64, level_log_size: f64,
del_docs_ratio_before_merge: f32,
} }
impl LogMergePolicy { impl LogMergePolicy {
@@ -56,49 +52,19 @@ impl LogMergePolicy {
pub fn set_level_log_size(&mut self, level_log_size: f64) { pub fn set_level_log_size(&mut self, level_log_size: f64) {
self.level_log_size = level_log_size; self.level_log_size = level_log_size;
} }
/// Set the ratio of deleted documents in a segment to tolerate.
///
/// If it is exceeded by any segment at a log level, a merge
/// will be triggered for that level.
///
/// If there is a single segment at a level, we effectively end up expunging
/// deleted documents from it.
///
/// # Panics
///
/// Panics if del_docs_ratio_before_merge is not within (0..1].
pub fn set_del_docs_ratio_before_merge(&mut self, del_docs_ratio_before_merge: f32) {
assert!(del_docs_ratio_before_merge <= 1.0f32);
assert!(del_docs_ratio_before_merge > 0f32);
self.del_docs_ratio_before_merge = del_docs_ratio_before_merge;
}
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
level
.iter()
.any(|segment| deletes_ratio(segment) > self.del_docs_ratio_before_merge)
}
}
fn deletes_ratio(segment: &SegmentMeta) -> f32 {
if segment.max_doc() == 0 {
return 0f32;
}
segment.num_deleted_docs() as f32 / segment.max_doc() as f32
} }
impl MergePolicy for LogMergePolicy { impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> { fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let size_sorted_segments = segments let mut size_sorted_segments = segments
.iter() .iter()
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32)) .filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
.collect::<Vec<&SegmentMeta>>(); .collect::<Vec<&SegmentMeta>>();
if size_sorted_segments.is_empty() { if size_sorted_segments.len() <= 1 {
return vec![]; return vec![];
} }
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
let mut current_max_log_size = f64::MAX; let mut current_max_log_size = f64::MAX;
let mut levels = vec![]; let mut levels = vec![];
@@ -116,10 +82,7 @@ impl MergePolicy for LogMergePolicy {
levels levels
.iter() .iter()
.filter(|level| { .filter(|level| level.len() >= self.min_num_segments)
level.len() >= self.min_num_segments
|| self.has_segment_above_deletes_threshold(level)
})
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect())) .map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
.collect() .collect()
} }
@@ -132,7 +95,6 @@ impl Default for LogMergePolicy {
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE, max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
min_layer_size: DEFAULT_MIN_LAYER_SIZE, min_layer_size: DEFAULT_MIN_LAYER_SIZE,
level_log_size: DEFAULT_LEVEL_LOG_SIZE, level_log_size: DEFAULT_LEVEL_LOG_SIZE,
del_docs_ratio_before_merge: DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE,
} }
} }
} }
@@ -152,7 +114,7 @@ mod tests {
use crate::Index; use crate::Index;
#[test] #[test]
fn create_index_test_max_merge_issue_1035() -> crate::Result<()> { fn create_index_test_max_merge_issue_1035() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intval", INDEXED); let int_field = schema_builder.add_u64_field("intval", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -165,34 +127,34 @@ mod tests {
log_merge_policy.set_max_docs_before_merge(1); log_merge_policy.set_max_docs_before_merge(1);
log_merge_policy.set_min_layer_size(0); log_merge_policy.set_min_layer_size(0);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));
// after every commit the merge checker is started, it will merge only segments with 1 // after every commit the merge checker is started, it will merge only segments with 1
// element in it because of the max_merge_size. // element in it because of the max_merge_size.
index_writer.add_document(doc!(int_field=>1_u64))?; index_writer.add_document(doc!(int_field=>1_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>2_u64))?; index_writer.add_document(doc!(int_field=>2_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>3_u64))?; index_writer.add_document(doc!(int_field=>3_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>4_u64))?; index_writer.add_document(doc!(int_field=>4_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64))?; index_writer.add_document(doc!(int_field=>5_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>6_u64))?; index_writer.add_document(doc!(int_field=>6_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>7_u64))?; index_writer.add_document(doc!(int_field=>7_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>8_u64))?; index_writer.add_document(doc!(int_field=>8_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let _segment_ids = index let _segment_ids = index
@@ -207,7 +169,6 @@ mod tests {
panic!("segment can't have more than two segments"); panic!("segment can't have more than two segments");
} // don't know how to wait for the merge, then it could be a simple eq } // don't know how to wait for the merge, then it could be a simple eq
} }
Ok(())
} }
fn test_merge_policy() -> LogMergePolicy { fn test_merge_policy() -> LogMergePolicy {
@@ -326,49 +287,4 @@ mod tests {
assert_eq!(result_list[0].0[1], test_input[4].id()); assert_eq!(result_list[0].0[1], test_input[4].id());
assert_eq!(result_list[0].0[2], test_input[5].id()); assert_eq!(result_list[0].0[2], test_input[5].id());
} }
#[test]
fn test_merge_single_segment_with_deletes_below_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_000, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert!(merge_candidates.is_empty());
}
#[test]
fn test_merge_single_segment_with_deletes_above_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_001, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_all_in_level() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
create_random_segment_meta(40_000),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 2);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_different_level_not_involved() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(100),
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 1);
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
}
} }

View File

@@ -1,6 +1,6 @@
use crate::Opstamp; use crate::Opstamp;
use crate::SegmentId; use crate::SegmentId;
use crate::{Inventory, TrackedObject}; use census::{Inventory, TrackedObject};
use std::collections::HashSet; use std::collections::HashSet;
use std::ops::Deref; use std::ops::Deref;

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,22 @@
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::collector::TopDocs; use crate::fastfield::{DeleteBitSet, FastFieldReader};
use crate::core::Index; use crate::schema::IndexRecordOption;
use crate::fastfield::MultiValuedFastFieldReader; use crate::{
use crate::fastfield::{AliveBitSet, FastFieldReader}; collector::TopDocs,
use crate::query::QueryParser; schema::{Cardinality, TextFieldIndexing},
use crate::schema::{ };
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing, use crate::{core::Index, fastfield::MultiValuedFastFieldReader};
use crate::{
query::QueryParser,
schema::{IntOptions, TextOptions},
};
use crate::{schema::Facet, IndexSortByField};
use crate::{schema::INDEXED, Order};
use crate::{
schema::{self, BytesOptions},
DocAddress,
}; };
use crate::schema::{IntOptions, TextOptions};
use crate::DocAddress;
use crate::IndexSortByField;
use crate::Order;
use crate::{DocSet, IndexSettings, Postings, Term}; use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on; use futures::executor::block_on;
@@ -22,7 +27,7 @@ mod tests {
.set_indexed(); .set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options); let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -34,17 +39,14 @@ mod tests {
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime"))) index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
.unwrap(); index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
index_writer
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime"))) assert!(index_writer.commit().is_ok());
.unwrap(); index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
index_writer.commit().unwrap();
index_writer assert!(index_writer.commit().is_ok());
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
.unwrap();
index_writer.commit().unwrap();
} }
// Merging the segments // Merging the segments
@@ -64,7 +66,7 @@ mod tests {
fn create_test_index( fn create_test_index(
index_settings: Option<IndexSettings>, index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool, force_disjunct_segment_sort_values: bool,
) -> crate::Result<Index> { ) -> Index {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue) .set_fast(Cardinality::SingleValue)
@@ -74,7 +76,7 @@ mod tests {
let bytes_options = BytesOptions::default().set_fast().set_indexed(); let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options); let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let multi_numbers = schema_builder.add_u64_field( let multi_numbers = schema_builder.add_u64_field(
"multi_numbers", "multi_numbers",
@@ -93,34 +95,34 @@ mod tests {
if let Some(settings) = index_settings { if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings); index_builder = index_builder.settings(settings);
} }
let index = index_builder.create_in_ram()?; let index = index_builder.create_in_ram().unwrap();
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
// segment 1 - range 1-3 // segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?; index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document( index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")), doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?; );
index_writer.add_document( index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"), doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?; );
index_writer.add_document( index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"), doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?; );
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20 // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?; index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
let in_val = if force_disjunct_segment_sort_values { let in_val = if force_disjunct_segment_sort_values {
10_u64 10_u64
} else { } else {
1 1
}; };
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?; index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000 // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values { let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50] [100_u64, 50]
@@ -129,24 +131,26 @@ mod tests {
}; };
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1] index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")), doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?; );
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?; index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
index_writer.add_document( index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num") doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?; );
index_writer.delete_term(Term::from_field_text(text_field, "deleteme")); index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index
let mut index_writer = index.writer_for_tests()?; .searchable_segment_ids()
block_on(index_writer.merge(&segment_ids))?; .expect("Searchable segments failed.");
index_writer.wait_merging_threads()?; let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
} }
Ok(index) index
} }
#[test] #[test]
@@ -179,8 +183,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
force_disjunct_segment_sort_values, force_disjunct_segment_sort_values,
) );
.unwrap();
let int_field = index.schema().get_field("intval").unwrap(); let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
@@ -254,10 +257,10 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(postings.doc_freq(), 2); assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!( assert_eq!(
postings.doc_freq_given_deletes( postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset) segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
), ),
2 2
); );
@@ -297,8 +300,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
false, false,
) );
.unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -334,10 +336,10 @@ mod tests {
.unwrap() .unwrap()
.unwrap(); .unwrap();
assert_eq!(postings.doc_freq(), 2); assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!( assert_eq!(
postings.doc_freq_given_deletes( postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset) segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
), ),
2 2
); );
@@ -365,8 +367,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
false, false,
) );
.unwrap();
let int_field = index.schema().get_field("intval").unwrap(); let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap(); let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -445,10 +446,10 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(postings.doc_freq(), 2); assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!( assert_eq!(
postings.doc_freq_given_deletes( postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset) segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
), ),
2 2
); );
@@ -516,7 +517,7 @@ mod bench_sorted_index_merge {
let index_doc = |index_writer: &mut IndexWriter, val: u64| { let index_doc = |index_writer: &mut IndexWriter, val: u64| {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, val); doc.add_u64(int_field, val);
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
}; };
// 3 segments with 10_000 values in the fast fields // 3 segments with 10_000 values in the fast fields
for _ in 0..3 { for _ in 0..3 {
@@ -545,15 +546,14 @@ mod bench_sorted_index_merge {
let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap(); let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
b.iter(|| { b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{ let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
let reader = &merger.readers[*ordinal as usize]; let u64_reader: DynamicFastFieldReader<u64> = reader.reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_reader(field) .typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(doc_id, reader, u64_reader) (doc_id, reader, u64_reader)
}); });
// add values in order of the new doc_ids // add values in order of the new docids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get(*doc_id); val = field_reader.get(*doc_id);
@@ -566,7 +566,7 @@ mod bench_sorted_index_merge {
Ok(()) Ok(())
} }
#[bench] #[bench]
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> { fn create_sorted_index_create_docid_mapping(b: &mut Bencher) -> crate::Result<()> {
let sort_by_field = IndexSortByField { let sort_by_field = IndexSortByField {
field: "intval".to_string(), field: "intval".to_string(),
order: Order::Desc, order: Order::Desc,

View File

@@ -1,17 +1,15 @@
pub mod delete_queue; pub mod delete_queue;
pub mod demuxer;
pub mod doc_id_mapping; pub mod doc_id_mapping;
mod doc_opstamp_mapping; mod doc_opstamp_mapping;
pub mod index_writer; pub mod index_writer;
mod index_writer_status;
mod log_merge_policy; mod log_merge_policy;
mod merge_operation; mod merge_operation;
pub mod merge_policy; pub mod merge_policy;
pub mod merger; pub mod merger;
mod merger_sorted_index_test; mod merger_sorted_index_test;
pub mod operation; pub mod operation;
pub mod prepared_commit; mod prepared_commit;
mod segment_entry; mod segment_entry;
mod segment_manager; mod segment_manager;
mod segment_register; mod segment_register;
@@ -20,11 +18,6 @@ pub mod segment_updater;
mod segment_writer; mod segment_writer;
mod stamper; mod stamper;
use crossbeam::channel;
use smallvec::SmallVec;
use crate::indexer::operation::AddOperation;
pub use self::index_writer::IndexWriter; pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy; pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation; pub use self::merge_operation::MergeOperation;
@@ -33,23 +26,12 @@ pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry; pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager; pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer; pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::merge_filtered_segments; pub use self::segment_updater::merge_segments;
pub use self::segment_updater::merge_indices;
pub use self::segment_writer::SegmentWriter; pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`. /// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy; pub type DefaultMergePolicy = LogMergePolicy;
// Batch of documents.
// Most of the time, users will send operation one-by-one, but it can be useful to
// send them as a small block to ensure that
// - all docs in the operation will happen on the same segment and continuous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests_mmap { mod tests_mmap {
@@ -57,20 +39,19 @@ mod tests_mmap {
use crate::{Index, Term}; use crate::{Index, Term};
#[test] #[test]
fn test_advance_delete_bug() -> crate::Result<()> { fn test_advance_delete_bug() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?; let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment // there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
// we need enough data to trigger the bug (at least 32 documents) // we need enough data to trigger the bug (at least 32 documents)
for _ in 0..32 { for _ in 0..32 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.commit()?; index_writer.commit().unwrap();
Ok(())
} }
} }

View File

@@ -18,38 +18,25 @@ impl<'a> PreparedCommit<'a> {
} }
} }
/// Returns the opstamp associated to the prepared commit.
pub fn opstamp(&self) -> Opstamp { pub fn opstamp(&self) -> Opstamp {
self.opstamp self.opstamp
} }
/// Adds an arbitrary payload to the commit.
pub fn set_payload(&mut self, payload: &str) { pub fn set_payload(&mut self, payload: &str) {
self.payload = Some(payload.to_string()) self.payload = Some(payload.to_string())
} }
/// Rollbacks any change.
pub fn abort(self) -> crate::Result<Opstamp> { pub fn abort(self) -> crate::Result<Opstamp> {
self.index_writer.rollback() self.index_writer.rollback()
} }
/// Proceeds to commit.
/// See `.commit_async()`.
pub fn commit(self) -> crate::Result<Opstamp> { pub fn commit(self) -> crate::Result<Opstamp> {
block_on(self.commit_async())
}
/// Proceeds to commit.
///
/// Unfortunately, contrary to what `PrepareCommit` may suggests,
/// this operation is not at all really light.
/// At this point deletes have not been flushed yet.
pub async fn commit_async(self) -> crate::Result<Opstamp> {
info!("committing {}", self.opstamp); info!("committing {}", self.opstamp);
self.index_writer let _ = block_on(
.segment_updater() self.index_writer
.schedule_commit(self.opstamp, self.payload) .segment_updater()
.await?; .schedule_commit(self.opstamp, self.payload),
);
Ok(self.opstamp) Ok(self.opstamp)
} }
} }

View File

@@ -9,16 +9,18 @@ use std::fmt;
/// ///
/// In addition to segment `meta`, /// In addition to segment `meta`,
/// it contains a few transient states /// it contains a few transient states
/// - `alive_bitset` is a bitset describing /// - `state` expresses whether the segment is already in the
/// documents that were alive during the commit /// middle of a merge
/// - `delete_bitset` is a bitset describing
/// documents that were deleted during the commit
/// itself. /// itself.
/// - `delete_cursor` is the position in the delete queue. /// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either /// Deletes happening before the cursor are reflected either
/// in the .del file or in the `alive_bitset`. /// in the .del file or in the `delete_bitset`.
#[derive(Clone)] #[derive(Clone)]
pub struct SegmentEntry { pub struct SegmentEntry {
meta: SegmentMeta, meta: SegmentMeta,
alive_bitset: Option<BitSet>, delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor, delete_cursor: DeleteCursor,
} }
@@ -27,11 +29,11 @@ impl SegmentEntry {
pub fn new( pub fn new(
segment_meta: SegmentMeta, segment_meta: SegmentMeta,
delete_cursor: DeleteCursor, delete_cursor: DeleteCursor,
alive_bitset: Option<BitSet>, delete_bitset: Option<BitSet>,
) -> SegmentEntry { ) -> SegmentEntry {
SegmentEntry { SegmentEntry {
meta: segment_meta, meta: segment_meta,
alive_bitset, delete_bitset,
delete_cursor, delete_cursor,
} }
} }
@@ -39,8 +41,8 @@ impl SegmentEntry {
/// Return a reference to the segment entry deleted bitset. /// Return a reference to the segment entry deleted bitset.
/// ///
/// `DocId` in this bitset are flagged as deleted. /// `DocId` in this bitset are flagged as deleted.
pub fn alive_bitset(&self) -> Option<&BitSet> { pub fn delete_bitset(&self) -> Option<&BitSet> {
self.alive_bitset.as_ref() self.delete_bitset.as_ref()
} }
/// Set the `SegmentMeta` for this segment. /// Set the `SegmentMeta` for this segment.

View File

@@ -66,10 +66,13 @@ impl SegmentRegister {
} }
pub fn segment_metas(&self) -> Vec<SegmentMeta> { pub fn segment_metas(&self) -> Vec<SegmentMeta> {
self.segment_states let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
.values() .values()
.map(|segment_entry| segment_entry.meta().clone()) .map(|segment_entry| segment_entry.meta().clone())
.collect() .collect();
segment_ids.sort_by_key(SegmentMeta::id);
segment_ids
} }
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool { pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {

View File

@@ -7,7 +7,6 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta; use crate::core::SegmentMeta;
use crate::core::META_FILEPATH; use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult}; use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::indexer::delete_queue::DeleteCursor; use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::index_writer::advance_deletes; use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory; use crate::indexer::merge_operation::MergeOperationInventory;
@@ -20,15 +19,12 @@ use crate::indexer::{DefaultMergePolicy, MergePolicy};
use crate::indexer::{MergeCandidate, MergeOperation}; use crate::indexer::{MergeCandidate, MergeOperation};
use crate::schema::Schema; use crate::schema::Schema;
use crate::Opstamp; use crate::Opstamp;
use crate::TantivyError;
use fail::fail_point;
use futures::channel::oneshot; use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder}; use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future; use futures::future::Future;
use futures::future::TryFutureExt; use futures::future::TryFutureExt;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::collections::HashSet; use std::collections::HashSet;
use std::io;
use std::io::Write; use std::io::Write;
use std::ops::Deref; use std::ops::Deref;
use std::path::PathBuf; use std::path::PathBuf;
@@ -61,9 +57,7 @@ pub fn save_new_metas(
payload: None, payload: None,
}, },
directory, directory,
)?; )
directory.sync_directory()?;
Ok(())
} }
/// Save the index meta file. /// Save the index meta file.
@@ -80,11 +74,6 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
let mut buffer = serde_json::to_vec_pretty(metas)?; let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer. // Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?; writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
))));
directory.sync_directory()?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?; directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(()) Ok(())
@@ -170,9 +159,9 @@ fn merge(
/// meant to work if you have an IndexWriter running for the origin indices, or /// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index. /// the destination Index.
#[doc(hidden)] #[doc(hidden)]
pub fn merge_indices<T: Into<Box<dyn Directory>>>( pub fn merge_segments<Dir: Directory>(
indices: &[Index], indices: &[Index],
output_directory: T, output_directory: Dir,
) -> crate::Result<Index> { ) -> crate::Result<Index> {
if indices.is_empty() { if indices.is_empty() {
// If there are no indices to merge, there is no need to do anything. // If there are no indices to merge, there is no need to do anything.
@@ -181,8 +170,19 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
)); ));
} }
let target_schema = indices[0].schema();
let target_settings = indices[0].settings().clone(); let target_settings = indices[0].settings().clone();
// let's check that all of the indices have the same schema
if indices
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
// let's check that all of the indices have the same index settings // let's check that all of the indices have the same index settings
if indices if indices
.iter() .iter()
@@ -199,61 +199,13 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
segments.extend(index.searchable_segments()?); segments.extend(index.searchable_segments()?);
} }
let non_filter = segments.iter().map(|_| None).collect::<Vec<_>>(); let mut merged_index = Index::create(output_directory, target_schema.clone(), target_settings)?;
merge_filtered_segments(&segments, target_settings, non_filter, output_directory)
}
/// Advanced: Merges a list of segments from different indices in a new index.
/// Additional you can provide a delete bitset for each segment to ignore doc_ids.
///
/// Returns `TantivyError` if the the indices list is empty or their
/// schemas don't match.
///
/// `output_directory`: is assumed to be empty.
///
/// # Warning
/// This function does NOT check or take the `IndexWriter` is running. It is not
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
segments: &[Segment],
target_settings: IndexSettings,
filter_doc_ids: Vec<Option<AliveBitSet>>,
output_directory: T,
) -> crate::Result<Index> {
if segments.is_empty() {
// If there are no indices to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
"No segments given to marge".to_string(),
));
}
let target_schema = segments[0].schema();
// let's check that all of the indices have the same schema
if segments
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
let mut merged_index = Index::create(
output_directory,
target_schema.clone(),
target_settings.clone(),
)?;
let merged_segment = merged_index.new_segment(); let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id(); let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set( let merger: IndexMerger = IndexMerger::open(
merged_index.schema(), merged_index.schema(),
merged_index.settings().clone(), merged_index.settings().clone(),
segments, &segments[..],
filter_doc_ids,
)?; )?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?; let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let num_docs = merger.write(segment_serializer)?; let num_docs = merger.write(segment_serializer)?;
@@ -273,7 +225,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
); );
let index_meta = IndexMeta { let index_meta = IndexMeta {
index_settings: target_settings, // index_settings of all segments should be the same index_settings: indices[0].load_metas()?.index_settings, // index_settings of all segments should be the same
segments: vec![segment_meta], segments: vec![segment_meta],
schema: target_schema, schema: target_schema,
opstamp: 0u64, opstamp: 0u64,
@@ -354,39 +306,37 @@ impl SegmentUpdater {
*self.merge_policy.write().unwrap() = arc_merge_policy; *self.merge_policy.write().unwrap() = arc_merge_policy;
} }
async fn schedule_task< fn schedule_future<T: 'static + Send, F: Future<Output = crate::Result<T>> + 'static + Send>(
T: 'static + Send,
F: Future<Output = crate::Result<T>> + 'static + Send,
>(
&self, &self,
task: F, f: F,
) -> crate::Result<T> { ) -> impl Future<Output = crate::Result<T>> {
if !self.is_alive() {
return Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
));
}
let (sender, receiver) = oneshot::channel(); let (sender, receiver) = oneshot::channel();
self.pool.spawn_ok(async move { if self.is_alive() {
let task_result = task.await; self.pool.spawn_ok(async move {
let _ = sender.send(task_result); let _ = sender.send(f.await);
}); });
let task_result = receiver.await; } else {
task_result.unwrap_or_else(|_| { let _ = sender.send(Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
)));
}
receiver.unwrap_or_else(|_| {
let err_msg = let err_msg =
"A segment_updater future did not success. This should never happen.".to_string(); "A segment_updater future did not success. This should never happen.".to_string();
Err(crate::TantivyError::SystemError(err_msg)) Err(crate::TantivyError::SystemError(err_msg))
}) })
} }
pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> { pub fn schedule_add_segment(
&self,
segment_entry: SegmentEntry,
) -> impl Future<Output = crate::Result<()>> {
let segment_updater = self.clone(); let segment_updater = self.clone();
self.schedule_task(async move { self.schedule_future(async move {
segment_updater.segment_manager.add_segment(segment_entry); segment_updater.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options().await; segment_updater.consider_merge_options().await;
Ok(()) Ok(())
}) })
.await
} }
/// Orders `SegmentManager` to remove all segments /// Orders `SegmentManager` to remove all segments
@@ -453,9 +403,11 @@ impl SegmentUpdater {
Ok(()) Ok(())
} }
pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> { pub fn schedule_garbage_collect(
&self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
let garbage_collect_future = garbage_collect_files(self.clone()); let garbage_collect_future = garbage_collect_files(self.clone());
self.schedule_task(garbage_collect_future).await self.schedule_future(garbage_collect_future)
} }
/// List the files that are useful to the index. /// List the files that are useful to the index.
@@ -473,13 +425,13 @@ impl SegmentUpdater {
files files
} }
pub(crate) async fn schedule_commit( pub fn schedule_commit(
&self, &self,
opstamp: Opstamp, opstamp: Opstamp,
payload: Option<String>, payload: Option<String>,
) -> crate::Result<()> { ) -> impl Future<Output = crate::Result<()>> {
let segment_updater: SegmentUpdater = self.clone(); let segment_updater: SegmentUpdater = self.clone();
self.schedule_task(async move { self.schedule_future(async move {
let segment_entries = segment_updater.purge_deletes(opstamp)?; let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries); segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?; segment_updater.save_metas(opstamp, payload)?;
@@ -487,7 +439,6 @@ impl SegmentUpdater {
segment_updater.consider_merge_options().await; segment_updater.consider_merge_options().await;
Ok(()) Ok(())
}) })
.await
} }
fn store_meta(&self, index_meta: &IndexMeta) { fn store_meta(&self, index_meta: &IndexMeta) {
@@ -562,7 +513,9 @@ impl SegmentUpdater {
e e
); );
// ... cancel merge // ... cancel merge
assert!(!cfg!(test), "Merge failed."); if cfg!(test) {
panic!("Merge failed.");
}
} }
} }
}); });
@@ -615,14 +568,14 @@ impl SegmentUpdater {
} }
} }
async fn end_merge( fn end_merge(
&self, &self,
merge_operation: MergeOperation, merge_operation: MergeOperation,
mut after_merge_segment_entry: SegmentEntry, mut after_merge_segment_entry: SegmentEntry,
) -> crate::Result<SegmentMeta> { ) -> impl Future<Output = crate::Result<SegmentMeta>> {
let segment_updater = self.clone(); let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone(); let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
self.schedule_task(async move { let end_merge_future = self.schedule_future(async move {
info!("End merge {:?}", after_merge_segment_entry.meta()); info!("End merge {:?}", after_merge_segment_entry.meta());
{ {
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -641,8 +594,9 @@ impl SegmentUpdater {
merge_operation.segment_ids(), merge_operation.segment_ids(),
advance_deletes_err advance_deletes_err
); );
assert!(!cfg!(test), "Merge failed."); if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge // ... cancel merge
// `merge_operations` are tracked. As it is dropped, the // `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge. // the segment_ids will be available again for merge.
@@ -665,9 +619,8 @@ impl SegmentUpdater {
let _ = garbage_collect_files(segment_updater).await; let _ = garbage_collect_files(segment_updater).await;
Ok(()) Ok(())
}) });
.await?; end_merge_future.map_ok(|_| after_merge_segment_meta)
Ok(after_merge_segment_meta)
} }
/// Wait for current merging threads. /// Wait for current merging threads.
@@ -693,19 +646,11 @@ impl SegmentUpdater {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::merge_indices; use super::merge_segments;
use crate::collector::TopDocs;
use crate::directory::RamDirectory; use crate::directory::RamDirectory;
use crate::fastfield::AliveBitSet;
use crate::indexer::merge_policy::tests::MergeWheneverPossible; use crate::indexer::merge_policy::tests::MergeWheneverPossible;
use crate::indexer::merger::IndexMerger;
use crate::indexer::segment_updater::merge_filtered_segments;
use crate::query::QueryParser;
use crate::schema::*; use crate::schema::*;
use crate::Directory;
use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::Segment;
#[test] #[test]
fn test_delete_during_merge() -> crate::Result<()> { fn test_delete_during_merge() -> crate::Result<()> {
@@ -718,19 +663,19 @@ mod tests {
index_writer.set_merge_policy(Box::new(MergeWheneverPossible)); index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"))?; index_writer.add_document(doc!(text_field=>"d"));
} }
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"))?; index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"))?; index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?; index_writer.commit()?;
let term = Term::from_field_text(text_field, "a"); let term = Term::from_field_text(text_field, "a");
@@ -748,50 +693,6 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn delete_all_docs_min() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests()?;
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(!seg_ids.is_empty());
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
index_writer.commit()?;
let term = Term::from_field_text(text_field, "b");
index_writer.delete_term(term);
index_writer.commit()?;
index_writer.wait_merging_threads()?;
let reader = index.reader()?;
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index.searchable_segment_ids()?;
assert!(seg_ids.is_empty());
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas()?.is_empty());
assert!(reader.searcher().segment_readers().is_empty());
Ok(())
}
#[test] #[test]
fn delete_all_docs() -> crate::Result<()> { fn delete_all_docs() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -802,19 +703,19 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"))?; index_writer.add_document(doc!(text_field=>"d"));
} }
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"))?; index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"))?; index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?; index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?; let seg_ids = index.searchable_segment_ids()?;
@@ -854,8 +755,8 @@ mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
@@ -881,22 +782,22 @@ mod tests {
// writing two segments // writing two segments
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"fizz"))?; index_writer.add_document(doc!(text_field=>"fizz"));
index_writer.add_document(doc!(text_field=>"buzz"))?; index_writer.add_document(doc!(text_field=>"buzz"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..1000 { for _ in 0..1000 {
index_writer.add_document(doc!(text_field=>"foo"))?; index_writer.add_document(doc!(text_field=>"foo"));
index_writer.add_document(doc!(text_field=>"bar"))?; index_writer.add_document(doc!(text_field=>"bar"));
} }
index_writer.commit()?; index_writer.commit()?;
indices.push(index); indices.push(index);
} }
assert_eq!(indices.len(), 3); assert_eq!(indices.len(), 3);
let output_directory: Box<dyn Directory> = Box::new(RamDirectory::default()); let output_directory = RamDirectory::default();
let index = merge_indices(&indices, output_directory)?; let index = merge_segments(&indices, output_directory)?;
assert_eq!(index.schema(), schema); assert_eq!(index.schema(), schema);
let segments = index.searchable_segments()?; let segments = index.searchable_segments()?;
@@ -910,7 +811,7 @@ mod tests {
#[test] #[test]
fn test_merge_empty_indices_array() { fn test_merge_empty_indices_array() {
let merge_result = merge_indices(&[], RamDirectory::default()); let merge_result = merge_segments(&[], RamDirectory::default());
assert!(merge_result.is_err()); assert!(merge_result.is_err());
} }
@@ -921,7 +822,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text"))?; index_writer.add_document(doc!(text_field=>"some text"));
index_writer.commit()?; index_writer.commit()?;
index index
}; };
@@ -931,197 +832,15 @@ mod tests {
let body_field = schema_builder.add_text_field("body", TEXT); let body_field = schema_builder.add_text_field("body", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(body_field=>"some body"))?; index_writer.add_document(doc!(body_field=>"some body"));
index_writer.commit()?; index_writer.commit()?;
index index
}; };
// mismatched schema index list // mismatched schema index list
let result = merge_indices(&[first_index, second_index], RamDirectory::default()); let result = merge_segments(&[first_index, second_index], RamDirectory::default());
assert!(result.is_err()); assert!(result.is_err());
Ok(()) Ok(())
} }
#[test]
fn test_merge_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment_1 = AliveBitSet::for_test_from_deleted_docs(&[1], 2);
let filter_segment_2 = AliveBitSet::for_test_from_deleted_docs(&[0], 2);
let filter_segments = vec![Some(filter_segment_1), Some(filter_segment_2)];
let merged_index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = merged_index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
Ok(())
}
#[test]
fn test_merge_single_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"test text"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[0], 4);
let filter_segments = vec![Some(filter_segment)];
let index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 2);
let searcher = index.reader()?.searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("test"), vec![] as Vec<u32>);
assert_eq!(do_search("text"), vec![0, 1]);
}
Ok(())
}
#[test]
fn test_apply_doc_id_filter_in_merger() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
{
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
let filter_segments = vec![Some(filter_segment)];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 2]);
}
{
let filter_segments = vec![None];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 1, 2]);
}
Ok(())
}
} }

View File

@@ -2,6 +2,7 @@ use super::{
doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}, doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping},
operation::AddOperation, operation::AddOperation,
}; };
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer; use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::compute_table_size; use crate::postings::compute_table_size;
@@ -17,7 +18,6 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
use crate::{core::Segment, store::StoreWriter}; use crate::{core::Segment, store::StoreWriter};
use crate::{fastfield::FastFieldsWriter, schema::Type};
use crate::{DocId, SegmentComponent}; use crate::{DocId, SegmentComponent};
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
@@ -173,11 +173,18 @@ impl SegmentWriter {
let (term_buffer, multifield_postings) = let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings); (&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() { match *field_entry.field_type() {
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
term_buffer.set_field(Type::Facet, field); term_buffer.set_field(field);
for field_value in field_values { let facets =
let facet = field_value.value().facet().ok_or_else(make_schema_error)?; field_values
let facet_str = facet.encoded_str(); .iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
});
for facet_str in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
FacetTokenizer FacetTokenizer
.token_stream(facet_str) .token_stream(facet_str)
@@ -234,11 +241,12 @@ impl SegmentWriter {
term_buffer, term_buffer,
) )
}; };
self.fieldnorms_writer.record(doc_id, field, num_tokens); self.fieldnorms_writer.record(doc_id, field, num_tokens);
} }
FieldType::U64(_) => { FieldType::U64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::U64, field_value.field()); term_buffer.set_field(field_value.field());
let u64_val = field_value let u64_val = field_value
.value() .value()
.u64_value() .u64_value()
@@ -249,7 +257,7 @@ impl SegmentWriter {
} }
FieldType::Date(_) => { FieldType::Date(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::Date, field_value.field()); term_buffer.set_field(field_value.field());
let date_val = field_value let date_val = field_value
.value() .value()
.date_value() .date_value()
@@ -260,7 +268,7 @@ impl SegmentWriter {
} }
FieldType::I64(_) => { FieldType::I64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::I64, field_value.field()); term_buffer.set_field(field_value.field());
let i64_val = field_value let i64_val = field_value
.value() .value()
.i64_value() .i64_value()
@@ -271,7 +279,7 @@ impl SegmentWriter {
} }
FieldType::F64(_) => { FieldType::F64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::F64, field_value.field()); term_buffer.set_field(field_value.field());
let f64_val = field_value let f64_val = field_value
.value() .value()
.f64_value() .f64_value()
@@ -282,7 +290,7 @@ impl SegmentWriter {
} }
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::Bytes, field_value.field()); term_buffer.set_field(field_value.field());
let bytes = field_value let bytes = field_value
.value() .value()
.bytes_value() .bytes_value()

View File

@@ -10,8 +10,6 @@
)] )]
#![doc(test(attr(allow(unused_variables), deny(warnings))))] #![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)] #![warn(missing_docs)]
#![allow(clippy::len_without_is_empty)]
#![allow(clippy::return_self_not_must_use)]
//! # `tantivy` //! # `tantivy`
//! //!
@@ -64,7 +62,7 @@
//! body => "He was an old man who fished alone in a skiff in \ //! body => "He was an old man who fished alone in a skiff in \
//! the Gulf Stream and he had gone eighty-four days \ //! the Gulf Stream and he had gone eighty-four days \
//! now without taking a fish." //! now without taking a fish."
//! ))?; //! ));
//! //!
//! // We need to call .commit() explicitly to force the //! // We need to call .commit() explicitly to force the
//! // index_writer to finish processing the documents in the queue, //! // index_writer to finish processing the documents in the queue,
@@ -105,7 +103,7 @@
//! A good place for you to get started is to check out //! A good place for you to get started is to check out
//! the example code ( //! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) / //! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-inc/tantivy/blob/main/examples/basic_search.rs)) //! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
#[cfg_attr(test, macro_use)] #[cfg_attr(test, macro_use)]
extern crate serde_json; extern crate serde_json;
@@ -158,7 +156,7 @@ pub mod termdict;
mod reader; mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer}; pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet; mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator}; pub use self::snippet::{Snippet, SnippetGenerator};
@@ -166,20 +164,17 @@ mod docset;
pub use self::docset::{DocSet, TERMINATED}; pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{Executor, SegmentComponent}; pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{ pub use crate::core::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
SearcherGeneration, Segment, SegmentId, SegmentMeta, SegmentId, SegmentMeta,
}; };
pub use crate::core::{InvertedIndexReader, SegmentReader}; pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory; pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*; pub use crate::indexer::merge_segments;
pub use crate::indexer::merge_filtered_segments;
pub use crate::indexer::merge_indices;
pub use crate::indexer::operation::UserOperation; pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{IndexWriter, PreparedCommit}; pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings; pub use crate::postings::Postings;
pub use crate::reader::LeasedItem; pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term}; pub use crate::schema::{Document, Term};
pub use census::{Inventory, TrackedObject};
pub use common::HasLen; pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt; use std::fmt;
@@ -239,7 +234,6 @@ pub fn version_string() -> &'static str {
pub mod merge_policy { pub mod merge_policy {
pub use crate::indexer::DefaultMergePolicy; pub use crate::indexer::DefaultMergePolicy;
pub use crate::indexer::LogMergePolicy; pub use crate::indexer::LogMergePolicy;
pub use crate::indexer::MergeCandidate;
pub use crate::indexer::MergePolicy; pub use crate::indexer::MergePolicy;
pub use crate::indexer::NoMergePolicy; pub use crate::indexer::NoMergePolicy;
} }
@@ -382,22 +376,24 @@ pub mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?; let index = Index::create_from_tempdir(schema).unwrap();
// writing the segment
let mut index_writer = index.writer_for_tests()?;
{ {
let doc = doc!(text_field=>"af b"); // writing the segment
index_writer.add_document(doc)?; let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
} }
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc)?;
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc)?;
}
index_writer.commit()?;
Ok(()) Ok(())
} }
@@ -407,12 +403,12 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"))?; index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -434,7 +430,7 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.commit()?;
let index_reader = index.reader()?; let index_reader = index.reader()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
@@ -456,9 +452,9 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -500,20 +496,20 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.add_document(doc!(text_field=>" a c"))?; index_writer.add_document(doc!(text_field=>" a c"));
// 2 // 2
index_writer.add_document(doc!(text_field=>" b c"))?; index_writer.add_document(doc!(text_field=>" b c"));
// 3 // 3
index_writer.add_document(doc!(text_field=>" b d"))?; index_writer.add_document(doc!(text_field=>" b d"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
// 4 // 4
index_writer.add_document(doc!(text_field=>" b c"))?; index_writer.add_document(doc!(text_field=>" b c"));
// 5 // 5
index_writer.add_document(doc!(text_field=>" a"))?; index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit()?; index_writer.commit()?;
} }
{ {
@@ -547,7 +543,7 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback()?;
@@ -583,7 +579,7 @@ pub mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
@@ -633,7 +629,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?; index_writer.add_document(doc!(field=>1u64));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -657,7 +653,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?; index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -681,7 +677,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI; let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?; index_writer.add_document(doc!(value_field => val));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -704,7 +700,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -727,14 +723,14 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?; index_writer.add_document(doc!(text_field=>"63"));
index_writer.add_document(doc!(text_field=>"70"))?; index_writer.add_document(doc!(text_field=>"70"));
index_writer.add_document(doc!(text_field=>"34"))?; index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc!(text_field=>"1"))?; index_writer.add_document(doc!(text_field=>"1"));
index_writer.add_document(doc!(text_field=>"38"))?; index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"))?; index_writer.add_document(doc!(text_field=>"33"));
index_writer.add_document(doc!(text_field=>"40"))?; index_writer.add_document(doc!(text_field=>"40"));
index_writer.add_document(doc!(text_field=>"17"))?; index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(Term::from_field_text(text_field, "38")); index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34")); index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?; index_writer.commit()?;
@@ -752,7 +748,7 @@ pub mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?; index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.commit()?; index_writer.commit()?;
} }
{ {
@@ -784,9 +780,9 @@ pub mod tests {
let reader = index.reader()?; let reader = index.reader()?;
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?; index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
@@ -848,9 +844,9 @@ pub mod tests {
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?; index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64); assert_eq!(reader.searcher().num_docs(), 3u64);
@@ -890,7 +886,7 @@ pub mod tests {
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document)?; index_writer.add_document(document);
index_writer.commit()?; index_writer.commit()?;
} }
let reader = index.reader()?; let reader = index.reader()?;
@@ -957,7 +953,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id))?; index_writer.add_document(doc!(id => doc_id));
} }
index_writer.commit()?; index_writer.commit()?;
@@ -974,7 +970,7 @@ pub mod tests {
index_writer.delete_term(Term::from_field_u64(id, doc_id)); index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
index_writer.add_document(doc!(id => doc_id))?; index_writer.add_document(doc!(id => doc_id));
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
@@ -1009,8 +1005,8 @@ pub mod tests {
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?; let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 { for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?; writer.add_document(doc!(body => "foo"));
writer.add_document(doc!(body => "boo"))?; writer.add_document(doc!(body => "boo"));
} }
writer.commit()?; writer.commit()?;
assert!(index.validate_checksum()?.is_empty()); assert!(index.validate_checksum()?.is_empty());

View File

@@ -1,5 +1,14 @@
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
unsafe fn binary_search_step(ptr: *const u32, target: u32, half_size: isize) -> *const u32 {
let mid = ptr.offset(half_size);
if *mid < target {
mid.offset(1)
} else {
ptr
}
}
/// Search the first index containing an element greater or equal to /// Search the first index containing an element greater or equal to
/// the target. /// the target.
/// ///
@@ -21,16 +30,18 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
/// end of the last block for instance. /// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block. /// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize { pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0; let start_ptr: *const u32 = &arr[0] as *const u32;
let mut len = arr.len(); unsafe {
for _ in 0..7 { let mut ptr = start_ptr;
len /= 2; ptr = binary_search_step(ptr, target, 63);
let pivot = unsafe { *arr.get_unchecked(start + len - 1) }; ptr = binary_search_step(ptr, target, 31);
if pivot < target { ptr = binary_search_step(ptr, target, 15);
start += len; ptr = binary_search_step(ptr, target, 7);
} ptr = binary_search_step(ptr, target, 3);
ptr = binary_search_step(ptr, target, 1);
let extra = if *ptr < target { 1 } else { 0 };
(ptr.offset_from(start_ptr) as usize) + extra
} }
start
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -393,8 +393,8 @@ mod tests {
} }
#[test] #[test]
fn test_block_segment_postings() -> crate::Result<()> { fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?; let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32; let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct // checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000); assert_eq!(block_segments.doc_freq(), 100_000);
@@ -409,17 +409,16 @@ mod tests {
offset += block.len() as u32; offset += block.len() as u32;
block_segments.advance(); block_segments.advance();
} }
Ok(())
} }
#[test] #[test]
fn test_skip_right_at_new_block() -> crate::Result<()> { fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>(); let mut doc_ids = (0..128).collect::<Vec<u32>>();
// 128 is missing // 128 is missing
doc_ids.push(129); doc_ids.push(129);
doc_ids.push(130); doc_ids.push(130);
{ {
let block_segments = build_block_postings(&doc_ids)?; let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129); assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129); assert_eq!(docset.doc(), 129);
@@ -428,7 +427,7 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED); assert_eq!(docset.advance(), TERMINATED);
} }
{ {
let block_segments = build_block_postings(&doc_ids).unwrap(); let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129); assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129); assert_eq!(docset.doc(), 129);
@@ -437,47 +436,46 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED); assert_eq!(docset.advance(), TERMINATED);
} }
{ {
let block_segments = build_block_postings(&doc_ids)?; let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0); assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED); assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED); assert_eq!(docset.doc(), TERMINATED);
} }
Ok(())
} }
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> { fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for &doc in docs { for &doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64))?; index_writer.add_document(doc!(int_field=>1u64));
} }
index_writer.add_document(doc!(int_field=>0u64))?; index_writer.add_document(doc!(int_field=>0u64));
last_doc = doc + 1; last_doc = doc + 1;
} }
index_writer.commit()?; index_writer.commit().unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64); let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term)?.unwrap(); let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
let block_postings = inverted_index inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
Ok(block_postings) .unwrap()
} }
#[test] #[test]
fn test_block_segment_postings_seek() -> crate::Result<()> { fn test_block_segment_postings_seek() {
let mut docs = vec![0]; let mut docs = vec![0];
for i in 0..1300 { for i in 0..1300 {
docs.push((i * i / 100) + i); docs.push((i * i / 100) + i);
} }
let mut block_postings = build_block_postings(&docs[..])?; let mut block_postings = build_block_postings(&docs[..]);
for i in &[0, 424, 10000] { for i in &[0, 424, 10000] {
block_postings.seek(*i); block_postings.seek(*i);
let docs = block_postings.docs(); let docs = block_postings.docs();
@@ -486,7 +484,6 @@ mod tests {
} }
block_postings.seek(100_000); block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
Ok(())
} }
#[test] #[test]
@@ -500,7 +497,7 @@ mod tests {
// the other containing odd numbers. // the other containing odd numbers.
for i in 0..6 { for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64); let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();

View File

@@ -47,6 +47,7 @@ pub mod tests {
use crate::fieldnorm::FieldNormReader; use crate::fieldnorm::FieldNormReader;
use crate::indexer::operation::AddOperation; use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter; use crate::indexer::SegmentWriter;
use crate::merge_policy::NoMergePolicy;
use crate::query::Scorer; use crate::query::Scorer;
use crate::schema::{Field, TextOptions}; use crate::schema::{Field, TextOptions};
use crate::schema::{IndexRecordOption, TextFieldIndexing}; use crate::schema::{IndexRecordOption, TextFieldIndexing};
@@ -86,12 +87,12 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#))?; index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?; index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 { for _ in 0..1_000 {
index_writer.add_document(doc!(title => r#"abc abc abc"#))?; index_writer.add_document(doc!(title => r#"abc abc abc"#));
} }
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?; index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -152,68 +153,50 @@ pub mod tests {
Ok(()) Ok(())
} }
#[test]
pub fn test_index_max_length_token() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
Ok(())
}
#[test] #[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> { pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1); let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello"); exceeding_token_text.push_str(" hello");
index_writer.add_document(doc!(text_field=>exceeding_token_text))?; let mut schema_builder = Schema::builder();
index_writer.commit()?; let text_options = TextOptions::default().set_indexing_options(
reader.reload()?; TextFieldIndexing::default()
let searcher = reader.searcher(); .set_index_option(IndexRecordOption::WithFreqsAndPositions)
let segment_reader = searcher.segment_reader(0u32); .set_tokenizer("simple_no_truncation"),
let inverted_index = segment_reader.inverted_index(text_field)?; );
assert_eq!(inverted_index.terms().num_terms(), 1); let text_field = schema_builder.add_text_field("text", text_options);
let mut bytes = vec![]; let schema = schema_builder.build();
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); let index = Index::create_in_ram(schema);
assert_eq!(&bytes, b"hello"); index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes, b"hello");
}
{
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
Ok(()) Ok(())
} }
@@ -332,13 +315,13 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c"))?; index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"))?; index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader let mut postings = segment_reader
.inverted_index(text_field)? .inverted_index(text_field)?
@@ -367,7 +350,7 @@ pub mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for i in 0u64..num_docs as u64 { for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64); let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -617,7 +600,7 @@ mod bench {
doc.add_text(text_field, "c"); doc.add_text(text_field, "c");
} }
doc.add_text(text_field, "d"); doc.add_text(text_field, "d");
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }

View File

@@ -5,8 +5,8 @@ use crate::postings::recorder::{
}; };
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;
use crate::postings::{FieldSerializer, InvertedIndexSerializer}; use crate::postings::{FieldSerializer, InvertedIndexSerializer};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term}; use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
use crate::schema::{IndexRecordOption, Type};
use crate::termdict::TermOrdinal; use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream; use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN}; use crate::tokenizer::{Token, MAX_TOKEN_LEN};
@@ -33,13 +33,15 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed() SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
} }
}) })
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed), .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) FieldType::U64(_)
| FieldType::I64(_) | FieldType::I64(_)
| FieldType::F64(_) | FieldType::F64(_)
| FieldType::Date(_) | FieldType::Date(_)
| FieldType::Bytes(_) | FieldType::Bytes(_)
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(), | FieldType::HierarchicalFacet(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
} }
} }
@@ -51,11 +53,11 @@ pub struct MultiFieldPostingsWriter {
} }
fn make_field_partition( fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> { ) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets let term_offsets_it = term_offsets
.iter() .iter()
.map(|(term, _, _)| term.field()) .map(|(key, _, _)| Term::wrap(key).field())
.enumerate(); .enumerate();
let mut prev_field_opt = None; let mut prev_field_opt = None;
let mut fields = vec![]; let mut fields = vec![];
@@ -130,10 +132,9 @@ impl MultiFieldPostingsWriter {
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> { ) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> = let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
Vec::with_capacity(self.term_index.len()); self.term_index.iter().collect();
term_offsets.extend(self.term_index.iter()); term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> = let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new(); HashMap::new();
@@ -144,7 +145,7 @@ impl MultiFieldPostingsWriter {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() { match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => { FieldType::Str(_) | FieldType::HierarchicalFacet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping // populating the (unordered term ord) -> (ordered term ord) mapping
// for the field. // for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()] let unordered_term_ids = term_offsets[byte_offsets.clone()]
@@ -208,7 +209,7 @@ pub trait PostingsWriter {
/// The actual serialization format is handled by the `PostingsSerializer`. /// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize( fn serialize(
&self, &self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>, serializer: &mut FieldSerializer<'_>,
term_heap: &MemoryArena, term_heap: &MemoryArena,
heap: &MemoryArena, heap: &MemoryArena,
@@ -225,7 +226,7 @@ pub trait PostingsWriter {
heap: &mut MemoryArena, heap: &mut MemoryArena,
term_buffer: &mut Term, term_buffer: &mut Term,
) -> u32 { ) -> u32 {
term_buffer.set_field(Type::Str, field); term_buffer.set_field(field);
let mut sink = |token: &Token| { let mut sink = |token: &Token| {
// We skip all tokens with a len greater than u16. // We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN { if token.text.len() <= MAX_TOKEN_LEN {
@@ -279,7 +280,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
) -> UnorderedTermId { ) -> UnorderedTermId {
debug_assert!(term.as_slice().len() >= 4); debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1; self.total_num_tokens += 1;
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| { term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder { if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc(); let current_doc = recorder.current_doc();
if current_doc != doc { if current_doc != doc {
@@ -299,17 +300,17 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
fn serialize( fn serialize(
&self, &self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>, serializer: &mut FieldSerializer<'_>,
termdict_heap: &MemoryArena, termdict_heap: &MemoryArena,
heap: &MemoryArena, heap: &MemoryArena,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
let mut buffer_lender = BufferLender::default(); let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs { for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = termdict_heap.read(*addr); let recorder: Rec = termdict_heap.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?; serializer.new_term(&term_bytes[4..], term_doc_freq)?;
recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map); recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map);
serializer.close_term()?; serializer.close_term()?;
} }

View File

@@ -1,5 +1,5 @@
use crate::docset::DocSet; use crate::docset::DocSet;
use crate::fastfield::AliveBitSet; use crate::fastfield::DeleteBitSet;
use crate::positions::PositionReader; use crate::positions::PositionReader;
use crate::postings::branchless_binary_search; use crate::postings::branchless_binary_search;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -34,7 +34,7 @@ impl SegmentPostings {
/// ///
/// This method will clone and scan through the posting lists. /// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation). /// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 { pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
let mut docset = self.clone(); let mut docset = self.clone();
let mut doc_freq = 0; let mut doc_freq = 0;
loop { loop {
@@ -42,7 +42,7 @@ impl SegmentPostings {
if doc == TERMINATED { if doc == TERMINATED {
return doc_freq; return doc_freq;
} }
if alive_bitset.is_alive(doc) { if delete_bitset.is_alive(doc) {
doc_freq += 1u32; doc_freq += 1u32;
} }
docset.advance(); docset.advance();
@@ -268,7 +268,7 @@ mod tests {
use common::HasLen; use common::HasLen;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::AliveBitSet; use crate::fastfield::DeleteBitSet;
use crate::postings::postings::Postings; use crate::postings::postings::Postings;
#[test] #[test]
@@ -296,10 +296,9 @@ mod tests {
fn test_doc_freq() { fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3); assert_eq!(docs.doc_freq(), 3);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); let delete_bitset = DeleteBitSet::for_test(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2); assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2);
let all_deleted = let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
} }
} }

View File

@@ -13,7 +13,6 @@ use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score}; use crate::{DocId, Score};
use common::CountingWriter; use common::CountingWriter;
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
use fail::fail_point;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::io::{self, Write}; use std::io::{self, Write};
@@ -213,9 +212,6 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it need to be encoded /// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding. /// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> { pub fn close_term(&mut self) -> io::Result<()> {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
});
if self.term_open { if self.term_open {
self.postings_serializer self.postings_serializer
.close_term(self.current_term_info.doc_freq)?; .close_term(self.current_term_info.doc_freq)?;
@@ -308,8 +304,10 @@ pub struct PostingsSerializer<W: Write> {
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>, bm25_weight: Option<Bm25Weight>,
num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of term in the field for that segment. avg_fieldnorm: Score, // Average number of term in the field for that segment.
// this value is used to compute the block wand information. // this value is used to compute the block wand information.
} }
impl<W: Write> PostingsSerializer<W> { impl<W: Write> PostingsSerializer<W> {
@@ -319,6 +317,10 @@ impl<W: Write> PostingsSerializer<W> {
mode: IndexRecordOption, mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> { ) -> PostingsSerializer<W> {
let num_docs = fieldnorm_reader
.as_ref()
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
.unwrap_or(0u32);
PostingsSerializer { PostingsSerializer {
output_write: CountingWriter::wrap(write), output_write: CountingWriter::wrap(write),
@@ -333,33 +335,21 @@ impl<W: Write> PostingsSerializer<W> {
fieldnorm_reader, fieldnorm_reader,
bm25_weight: None, bm25_weight: None,
num_docs,
avg_fieldnorm, avg_fieldnorm,
} }
} }
pub fn new_term(&mut self, term_doc_freq: u32) { pub fn new_term(&mut self, term_doc_freq: u32) {
self.bm25_weight = None; if self.mode.has_freq() && self.num_docs > 0 {
let bm25_weight = Bm25Weight::for_one_term(
if !self.mode.has_freq() { term_doc_freq as u64,
return; self.num_docs as u64,
self.avg_fieldnorm,
);
self.bm25_weight = Some(bm25_weight);
} }
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
} }
fn write_block(&mut self) { fn write_block(&mut self) {

View File

@@ -186,6 +186,7 @@ mod tests {
use super::*; use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test] #[test]
fn test_stack() { fn test_stack() {
let mut heap = MemoryArena::new(); let mut heap = MemoryArena::new();

View File

@@ -3,7 +3,6 @@ use murmurhash32::murmurhash2;
use super::{Addr, MemoryArena}; use super::{Addr, MemoryArena};
use crate::postings::stacker::memory_arena::store; use crate::postings::stacker::memory_arena::store;
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;
use crate::Term;
use byteorder::{ByteOrder, NativeEndian}; use byteorder::{ByteOrder, NativeEndian};
use std::iter; use std::iter;
use std::mem; use std::mem;
@@ -82,13 +81,13 @@ pub struct Iter<'a> {
} }
impl<'a> Iterator for Iter<'a> { impl<'a> Iterator for Iter<'a> {
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId); type Item = (&'a [u8], Addr, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| { self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket]; let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr); let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(Term::wrap(key), offset, kv.unordered_term_id) (key, offset, kv.unordered_term_id)
}) })
} }
} }
@@ -149,10 +148,6 @@ impl TermHashMap {
unordered_term_id unordered_term_id
} }
pub fn len(&self) -> usize {
self.len
}
pub fn iter(&self) -> Iter<'_> { pub fn iter(&self) -> Iter<'_> {
Iter { Iter {
inner: self.occupied.iter(), inner: self.occupied.iter(),
@@ -190,19 +185,21 @@ impl TermHashMap {
/// will be in charge of returning a default value. /// will be in charge of returning a default value.
/// If the key already as an associated value, then it will be passed /// If the key already as an associated value, then it will be passed
/// `Some(previous_value)`. /// `Some(previous_value)`.
pub fn mutate_or_create<V, TMutator>( pub fn mutate_or_create<S, V, TMutator>(
&mut self, &mut self,
key: &[u8], key: S,
mut updater: TMutator, mut updater: TMutator,
) -> UnorderedTermId ) -> UnorderedTermId
where where
S: AsRef<[u8]>,
V: Copy + 'static, V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V, TMutator: FnMut(Option<V>) -> V,
{ {
if self.is_saturated() { if self.is_saturated() {
self.resize(); self.resize();
} }
let hash = murmurhash2(key); let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash); let mut probe = self.probe(hash);
loop { loop {
let bucket = probe.next_probe(); let bucket = probe.next_probe();
@@ -210,18 +207,21 @@ impl TermHashMap {
if kv.is_empty() { if kv.is_empty() {
// The key does not exists yet. // The key does not exists yet.
let val = updater(None); let val = updater(None);
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>(); let num_bytes =
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
let key_addr = self.heap.allocate_space(num_bytes); let key_addr = self.heap.allocate_space(num_bytes);
{ {
let data = self.heap.slice_mut(key_addr, num_bytes); let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key.len() as u16); NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key.len(); let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key); data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val); store(&mut data[stop..], val);
} }
return self.set_bucket(hash, key_addr, bucket); return self.set_bucket(hash, key_addr, bucket);
} else if kv.hash == hash { } else if kv.hash == hash {
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) { if let Some(val_addr) =
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
{
let v = self.heap.read(val_addr); let v = self.heap.read(val_addr);
let new_v = updater(Some(v)); let new_v = updater(Some(v));
self.heap.write_at(val_addr, new_v); self.heap.write_at(val_addr, new_v);
@@ -241,18 +241,25 @@ mod tests {
#[test] #[test]
fn test_hash_map() { fn test_hash_map() {
let mut hash_map: TermHashMap = TermHashMap::new(18); let mut hash_map: TermHashMap = TermHashMap::new(18);
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| { {
assert_eq!(opt_val, None); hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
3u32 assert_eq!(opt_val, None);
}); 3u32
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| { });
assert_eq!(opt_val, None); }
4u32 {
}); hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| { assert_eq!(opt_val, None);
assert_eq!(opt_val, Some(3u32)); 4u32
5u32 });
}); }
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(3u32));
5u32
});
}
let mut vanilla_hash_map = HashMap::new(); let mut vanilla_hash_map = HashMap::new();
let iter_values = hash_map.iter(); let iter_values = hash_map.iter();
for (key, addr, _) in iter_values { for (key, addr, _) in iter_values {

View File

@@ -78,29 +78,29 @@ mod tests {
use crate::schema::{Schema, TEXT}; use crate::schema::{Schema, TEXT};
use crate::Index; use crate::Index;
fn create_test_index() -> crate::Result<Index> { fn create_test_index() -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_text_field("text", TEXT); let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa"))?; index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"))?; index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"))?; index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit()?; index_writer.commit().unwrap();
Ok(index) index
} }
#[test] #[test]
fn test_all_query() -> crate::Result<()> { fn test_all_query() {
let index = create_test_index()?; let index = create_test_index();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false)?; let weight = AllQuery.weight(&searcher, false).unwrap();
{ {
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0)?; let mut scorer = weight.scorer(reader, 1.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), 1u32); assert_eq!(scorer.advance(), 1u32);
assert_eq!(scorer.doc(), 1u32); assert_eq!(scorer.doc(), 1u32);
@@ -108,30 +108,28 @@ mod tests {
} }
{ {
let reader = searcher.segment_reader(1); let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader, 1.0)?; let mut scorer = weight.scorer(reader, 1.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), TERMINATED); assert_eq!(scorer.advance(), TERMINATED);
} }
Ok(())
} }
#[test] #[test]
fn test_all_query_with_boost() -> crate::Result<()> { fn test_all_query_with_boost() {
let index = create_test_index()?; let index = create_test_index();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false)?; let weight = AllQuery.weight(&searcher, false).unwrap();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
{ {
let mut scorer = weight.scorer(reader, 2.0)?; let mut scorer = weight.scorer(reader, 2.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 2.0); assert_eq!(scorer.score(), 2.0);
} }
{ {
let mut scorer = weight.scorer(reader, 1.5)?; let mut scorer = weight.scorer(reader, 1.5).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.5); assert_eq!(scorer.score(), 1.5);
} }
Ok(())
} }
} }

View File

@@ -92,16 +92,16 @@ mod tests {
use crate::Index; use crate::Index;
use tantivy_fst::Automaton; use tantivy_fst::Automaton;
fn create_index() -> crate::Result<Index> { fn create_index() -> Index {
let mut schema = Schema::builder(); let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING); let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build()); let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc"))?; index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd"))?; index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd"))?; index_writer.add_document(doc!(title=>"abcd"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
Ok(index) index
} }
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
@@ -140,32 +140,34 @@ mod tests {
} }
#[test] #[test]
fn test_automaton_weight() -> crate::Result<()> { fn test_automaton_weight() {
let index = create_index()?; let index = create_index();
let field = index.schema().get_field("title").unwrap(); let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA); let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.0)?; let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.0); assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), 2u32); assert_eq!(scorer.advance(), 2u32);
assert_eq!(scorer.doc(), 2u32); assert_eq!(scorer.doc(), 2u32);
assert_eq!(scorer.score(), 1.0); assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), TERMINATED); assert_eq!(scorer.advance(), TERMINATED);
Ok(())
} }
#[test] #[test]
fn test_automaton_weight_boost() -> crate::Result<()> { fn test_automaton_weight_boost() {
let index = create_index()?; let index = create_index();
let field = index.schema().get_field("title").unwrap(); let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA); let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.32)?; let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.32)
.unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.32); assert_eq!(scorer.score(), 1.32);
Ok(())
} }
} }

View File

@@ -42,39 +42,27 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc)) Some((before_pivot_len, pivot_len, pivot_doc))
} }
/// Advance the scorer with best score among the scorers[..pivot_len] to // Before and after calling this method, scorers need to be sorted by their `.doc()`.
/// the next doc candidate defined by the min of `last_doc_in_block + 1` for
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer( fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>, scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize, pivot_len: usize,
) { ) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1; let mut scorer_to_seek = pivot_len - 1;
let mut global_max_score = scorers[scorer_to_seek].max_score; let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
for scorer_ord in (0..pivot_len - 1).rev() { for scorer_ord in (0..pivot_len - 1).rev() {
let scorer = &scorers[scorer_ord]; let scorer = &scorers[scorer_ord];
if scorer.last_doc_in_block() <= doc_to_seek_after { if scorer.last_doc_in_block() <= doc_to_seek_after {
doc_to_seek_after = scorer.last_doc_in_block(); doc_to_seek_after = scorer.last_doc_in_block();
}
if scorers[scorer_ord].max_score > global_max_score {
global_max_score = scorers[scorer_ord].max_score;
scorer_to_seek = scorer_ord; scorer_to_seek = scorer_ord;
} }
} }
// Add +1 to go to the next block unless we are already at the end.
if doc_to_seek_after != TERMINATED {
doc_to_seek_after += 1;
}
for scorer in &scorers[pivot_len..] { for scorer in &scorers[pivot_len..] {
if scorer.doc() <= doc_to_seek_after { if scorer.doc() <= doc_to_seek_after {
doc_to_seek_after = scorer.doc(); doc_to_seek_after = scorer.doc();
} }
} }
scorers[scorer_to_seek].seek(doc_to_seek_after); scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek); restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
} }
@@ -142,9 +130,6 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>,
term_scorers.sort_by_key(|scorer| scorer.doc()); term_scorers.sort_by_key(|scorer| scorer.doc());
} }
/// Implements the WAND (Weak AND) algorithm for dynamic pruning
/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
/// Link: http://engineering.nyu.edu/~suel/papers/bmw.pdf
pub fn block_wand( pub fn block_wand(
mut scorers: Vec<TermScorer>, mut scorers: Vec<TermScorer>,
mut threshold: Score, mut threshold: Score,
@@ -202,7 +187,6 @@ pub fn block_wand(
.iter_mut() .iter_mut()
.map(|scorer| scorer.score()) .map(|scorer| scorer.score())
.sum(); .sum();
if score > threshold { if score > threshold {
threshold = callback(pivot_doc, score); threshold = callback(pivot_doc, score);
} }
@@ -211,56 +195,6 @@ pub fn block_wand(
} }
} }
/// Specialized version of [`block_wand`] for a single scorer.
/// In this case, the algorithm is simple and readable and faster (~ x3)
/// than the generic algorithm.
/// The algorithm behaves as follows:
/// - While we don't hit the end of the docset:
/// - While the block max score is under the `threshold`, go to the
/// next block.
/// - On a block, advance until the end and execute `callback``
/// when the doc score is greater or equal to the `threshold`.
pub fn block_wand_single_scorer(
mut scorer: TermScorer,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut doc = scorer.doc();
loop {
// We position the scorer on a block that can reach
// the threshold.
while scorer.block_max_score() < threshold {
let last_doc_in_block = scorer.last_doc_in_block();
if last_doc_in_block == TERMINATED {
return;
}
doc = last_doc_in_block + 1;
scorer.shallow_seek(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
if doc == TERMINATED {
break;
}
loop {
let score = scorer.score();
if score > threshold {
threshold = callback(doc, score);
}
debug_assert!(doc <= scorer.last_doc_in_block());
if doc == scorer.last_doc_in_block() {
break;
}
doc = scorer.advance();
if doc == TERMINATED {
return;
}
}
doc += 1;
scorer.shallow_seek(doc);
}
}
struct TermScorerWithMaxScore<'a> { struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer, scorer: &'a mut TermScorer,
max_score: Score, max_score: Score,
@@ -338,14 +272,13 @@ mod tests {
} }
fn compute_checkpoints_for_each_pruning( fn compute_checkpoints_for_each_pruning(
mut term_scorers: Vec<TermScorer>, term_scorers: Vec<TermScorer>,
n: usize, n: usize,
) -> Vec<(DocId, Score)> { ) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n); let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new(); let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut limit: Score = 0.0; let mut limit: Score = 0.0;
super::block_wand(term_scorers, Score::MIN, &mut |doc, score| {
let callback = &mut |doc, score| {
heap.push(Float(score)); heap.push(Float(score));
if heap.len() > n { if heap.len() > n {
heap.pop().unwrap(); heap.pop().unwrap();
@@ -357,14 +290,7 @@ mod tests {
checkpoints.push((doc, score)); checkpoints.push((doc, score));
} }
limit limit
}; });
if term_scorers.len() == 1 {
let scorer = term_scorers.pop().unwrap();
super::block_wand_single_scorer(scorer, Score::MIN, callback);
} else {
super::block_wand(term_scorers, Score::MIN, callback);
}
checkpoints checkpoints
} }
@@ -498,14 +424,6 @@ mod tests {
} }
} }
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_block_wand_single_term_scorer((posting_lists, fieldnorms) in gen_term_scorers(1)) {
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
}
}
#[test] #[test]
fn test_fn_reproduce_proptest() { fn test_fn_reproduce_proptest() {
let postings_lists = &[ let postings_lists = &[

View File

@@ -41,22 +41,22 @@ use std::collections::BTreeMap;
/// let mut index_writer = index.writer(3_000_000)?; /// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Name of the Wind", /// title => "The Name of the Wind",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of Muadib", /// title => "The Diary of Muadib",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "A Dairy Cow", /// title => "A Dairy Cow",
/// body => "hidden", /// body => "hidden",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "A Dairy Cow", /// title => "A Dairy Cow",
/// body => "found", /// body => "found",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl", /// title => "The Diary of a Young Girl",
/// ))?; /// ));
/// index_writer.commit()?; /// index_writer.commit().unwrap();
/// } /// }
/// ///
/// let reader = index.reader()?; /// let reader = index.reader()?;
@@ -217,11 +217,11 @@ mod tests {
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?; let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(text=>"b c"))?; writer.add_document(doc!(text=>"b c"));
writer.add_document(doc!(text=>"a c"))?; writer.add_document(doc!(text=>"a c"));
writer.add_document(doc!(text=>"a b"))?; writer.add_document(doc!(text=>"a b"));
writer.add_document(doc!(text=>"a d"))?; writer.add_document(doc!(text=>"a d"));
writer.commit()?; writer.commit()?;
Ok(index) Ok(index)
} }

View File

@@ -3,7 +3,6 @@ mod boolean_query;
mod boolean_weight; mod boolean_weight;
pub(crate) use self::block_wand::block_wand; pub(crate) use self::block_wand::block_wand;
pub(crate) use self::block_wand::block_wand_single_scorer;
pub use self::boolean_query::BooleanQuery; pub use self::boolean_query::BooleanQuery;
#[cfg(test)] #[cfg(test)]
@@ -26,75 +25,72 @@ mod tests {
use crate::Index; use crate::Index;
use crate::{DocAddress, DocId, Score}; use crate::{DocAddress, DocId, Score};
fn aux_test_helper() -> crate::Result<(Index, Field)> { fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"))?; index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"))?; index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"))?; index_writer.add_document(doc!(text_field => "b c"));
index_writer.add_document(doc!(text_field => "a b c d"))?; index_writer.add_document(doc!(text_field => "a b c d"));
index_writer.add_document(doc!(text_field => "d"))?; index_writer.add_document(doc!(text_field => "d"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
Ok((index, text_field)) (index, text_field)
} }
#[test] #[test]
pub fn test_boolean_non_all_term_disjunction() -> crate::Result<()> { pub fn test_boolean_non_all_term_disjunction() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("(+a +b) d")?; let query = query_parser.parse_query("(+a +b) d").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
assert_eq!(query.count(&searcher)?, 3); assert_eq!(query.count(&searcher).unwrap(), 3);
Ok(())
} }
#[test] #[test]
pub fn test_boolean_single_must_clause() -> crate::Result<()> { pub fn test_boolean_single_must_clause() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a")?; let query = query_parser.parse_query("+a").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
Ok(())
} }
#[test] #[test]
pub fn test_boolean_termonly_intersection() -> crate::Result<()> { pub fn test_boolean_termonly_intersection() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
{ {
let query = query_parser.parse_query("+a +b +c")?; let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<Intersection<TermScorer>>()); assert!(scorer.is::<Intersection<TermScorer>>());
} }
{ {
let query = query_parser.parse_query("+a +(b c)")?; let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>()); assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_reqopt() -> crate::Result<()> { pub fn test_boolean_reqopt() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
{ {
let query = query_parser.parse_query("+a b")?; let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<RequiredOptionalScorer< assert!(scorer.is::<RequiredOptionalScorer<
Box<dyn Scorer>, Box<dyn Scorer>,
Box<dyn Scorer>, Box<dyn Scorer>,
@@ -102,17 +98,16 @@ mod tests {
>>()); >>());
} }
{ {
let query = query_parser.parse_query("+a b")?; let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false)?; let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query() -> crate::Result<()> { pub fn test_boolean_query() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -123,7 +118,7 @@ mod tests {
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let matching_docs = |boolean_query: &dyn Query| { let matching_docs = |boolean_query: &dyn Query| {
reader reader
@@ -170,12 +165,11 @@ mod tests {
let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]); let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]);
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new()); assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query_two_excluded() -> crate::Result<()> { pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -186,7 +180,7 @@ mod tests {
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| { let matching_topdocs = |query: &dyn Query| {
reader reader
@@ -219,21 +213,20 @@ mod tests {
assert_eq!(top_doc, DocAddress::new(0, 4)); assert_eq!(top_doc, DocAddress::new(0, 4));
assert_eq!(top_score, score_doc_4); assert_eq!(top_score, score_doc_4);
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query_with_weight() -> crate::Result<()> { pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"))?; index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"))?; index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"))?; index_writer.add_document(doc!(text_field => "b c"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let term_a: Box<dyn Query> = Box::new(TermQuery::new( let term_a: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "a"), Term::from_field_text(text_field, "a"),
@@ -249,21 +242,24 @@ mod tests {
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]); BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let boolean_weight = boolean_query.weight(&searcher, true).unwrap(); let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
{ {
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 1.0)?; let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445); assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
} }
{ {
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 2.0)?; let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689); assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
} }
Ok(())
} }
#[test] #[test]
pub fn test_intersection_score() -> crate::Result<()> { pub fn test_intersection_score() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -273,7 +269,7 @@ mod tests {
let query: Box<dyn Query> = Box::new(term_query); let query: Box<dyn Query> = Box::new(term_query);
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let score_docs = |boolean_query: &dyn Query| { let score_docs = |boolean_query: &dyn Query| {
let fruit = reader let fruit = reader
.searcher() .searcher()
@@ -291,7 +287,6 @@ mod tests {
assert_nearly_equals!(scores[0], 0.977973); assert_nearly_equals!(scores[0], 0.977973);
assert_nearly_equals!(scores[1], 0.84699446); assert_nearly_equals!(scores[1], 0.84699446);
} }
Ok(())
} }
#[test] #[test]
@@ -301,8 +296,8 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?;
index_writer.add_document(doc!(text=>"a"))?; index_writer.add_document(doc!(text=>"a"));
index_writer.add_document(doc!(text=>"b"))?; index_writer.add_document(doc!(text=>"b"));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let term_a: Box<dyn Query> = Box::new(TermQuery::new( let term_a: Box<dyn Query> = Box::new(TermQuery::new(

View File

@@ -1,4 +1,4 @@
use crate::fastfield::AliveBitSet; use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight}; use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term}; use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
@@ -118,8 +118,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
self.underlying.size_hint() self.underlying.size_hint()
} }
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
self.underlying.count(alive_bitset) self.underlying.count(delete_bitset)
} }
fn count_including_deleted(&mut self) -> u32 { fn count_including_deleted(&mut self) -> u32 {
@@ -141,20 +141,19 @@ mod tests {
use crate::{DocAddress, Document, Index}; use crate::{DocAddress, Document, Index};
#[test] #[test]
fn test_boost_query_explain() -> crate::Result<()> { fn test_boost_query_explain() {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new())?; index_writer.add_document(Document::new());
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let query = BoostQuery::new(Box::new(AllQuery), 0.2); let query = BoostQuery::new(Box::new(AllQuery), 0.2);
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap(); let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
assert_eq!( assert_eq!(
explanation.to_pretty_json(), explanation.to_pretty_json(),
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}" "{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}"
); )
Ok(())
} }
} }

Some files were not shown because too many files have changed in this diff Show More