Compare commits

2 Commits

Author        SHA1        Message        Date
Paul Masurel  89f91b1b58  first stab     2021-10-06 12:10:16 +09:00
Paul Masurel  19965c46bc  Added wasm-mt  2021-10-06 10:45:17 +09:00
138 changed files with 2764 additions and 5086 deletions


@@ -6,10 +6,3 @@ updates:
       interval: daily
       time: "20:00"
     open-pull-requests-limit: 10
-  - package-ecosystem: "github-actions"
-    directory: "/"
-    schedule:
-      interval: daily
-      time: "20:00"
-    open-pull-requests-limit: 10


@@ -18,7 +18,7 @@ jobs:
       - name: Generate code coverage
         run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
           files: lcov.info


@@ -21,10 +21,10 @@ jobs:
       - name: Install latest nightly to test also against unstable feature flag
         uses: actions-rs/toolchain@v1
         with:
-          toolchain: stable
+          toolchain: nightly
           override: true
           components: rustfmt
       - name: Run tests
-        run: cargo test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
+        run: cargo test --all-features --verbose --workspace
       - name: Check Formatting
         run: cargo fmt --all -- --check

.gitignore (vendored, 1 line changed)

@@ -1,5 +1,4 @@
 tantivy.iml
-.cargo
 proptest-regressions
 *.swp
 target

.travis.yml (new file, 92 lines)

@@ -0,0 +1,92 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2
dist: trusty
language: rust
services: docker
sudo: required
env:
global:
- CRATE_NAME=tantivy
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
#- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
# Linux
#- env: TARGET=aarch64-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
# OSX
#- env: TARGET=x86_64-apple-darwin
# os: osx
before_install:
- set -e
- rustup self update
- rustup component add rustfmt
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- env | grep "TRAVIS"
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
script:
- bash ci/script.sh
- cargo fmt --all -- --check
before_deploy:
- sh ci/before_deploy.sh
after_success:
# Needs GH_TOKEN env var to be set in travis settings
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
#cache: cargo
#before_cache:
# # Travis can't cache files that are not readable by "others"
# - chmod -R a+r $HOME/.cargo
# - find ./target/debug -type f -maxdepth 1 -delete
# - rm -f ./target/.rustc_info.json
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
# - rm -r target/debug/examples/
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master
notifications:
email:
on_success: never


@@ -1,21 +1,6 @@
-Tantivy 0.17
-================================
-- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
-- Adds a searcher Warmer API (@shikhar)
-- Change to non-strict schema. Ignore fields in data which are not defined in schema. Previously this returned an error. #1211
-- Facets are necessarily indexed. Existing index with indexed facets should work out of the box. Index without facets that are marked with index: false should be broken (but they were already broken in a sense). (@fulmicoton) #1195 .
-- Bugfix that could in theory impact durability in theory on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
-- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
-- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)
-Tantivy 0.16.2
-================================
-- Bugfix in FuzzyTermQuery. (tranposition_cost_one was not doing anything)
 Tantivy 0.16.1
 ========================
 - Major Bugfix on multivalued fastfield. #1151
-- Demux operation (@PSeitz)
 Tantivy 0.16.0
 =========================
@@ -128,7 +113,7 @@ Tantivy 0.12.0
 ## How to update?
 Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
-minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs
+minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
 to check for some code sample.
 Tantivy 0.11.3
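
The migration note in the changelog above points readers at examples/custom_tokenizer.rs. As a hedged illustration of the tokenizer-manager API it refers to (around tantivy 0.16/0.17), registering a custom tokenizer looks roughly like the sketch below; the field name and the "ngram3" tokenizer name are illustrative, not taken from this diff.

```rust
use tantivy::schema::{Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;

fn build_index_with_custom_tokenizer() -> tantivy::Result<Index> {
    // Declare a text field whose indexing options point at a tokenizer
    // registered under the name "ngram3".
    let mut schema_builder = Schema::builder();
    let text_indexing = TextFieldIndexing::default().set_tokenizer("ngram3");
    let text_options = TextOptions::default().set_indexing_options(text_indexing);
    schema_builder.add_text_field("title", text_options);
    let index = Index::create_in_ram(schema_builder.build());

    // Register the tokenizer under that name before indexing any document.
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));
    Ok(index)
}
```

The key point is that the tokenizer must be registered in the manager under the same name the schema refers to before any document is indexed.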


@@ -1,13 +1,13 @@
 [package]
 name = "tantivy"
-version = "0.17.0-dev"
+version = "0.16.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
 documentation = "https://docs.rs/tantivy/"
-homepage = "https://github.com/quickwit-inc/tantivy"
-repository = "https://github.com/quickwit-inc/tantivy"
+homepage = "https://github.com/tantivy-search/tantivy"
+repository = "https://github.com/tantivy-search/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"
@@ -20,12 +20,13 @@ once_cell = "1.7.2"
 regex ={ version = "1.5.4", default-features = false, features = ["std"] }
 tantivy-fst = "0.3"
 memmap2 = {version = "0.5", optional=true}
-lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
+lz4_flex = { version = "0.9.0", default-features = false, features = ["checked-decode"], optional = true }
 brotli = { version = "3.3", optional = true }
 snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.2", optional = true }
 log = "0.4.14"
 serde = { version = "1.0.126", features = ["derive"] }
+serde_closure = "0.3"
 serde_json = "1.0.64"
 num_cpus = "1.13"
 fs2={ version = "0.4.3", optional = true }
@@ -37,7 +38,7 @@ tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
 tantivy-bitpacker = { version="0.1", path="./bitpacker" }
 common = { version = "0.1", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
-ownedbytes = { version="0.2", path="./ownedbytes" }
+ownedbytes = { version="0.1", path="./ownedbytes" }
 stable_deref_trait = "1.2"
 rust-stemmers = "1.2"
 downcast-rs = "1.2"
@@ -46,15 +47,16 @@ census = "0.4"
 fnv = "1.0.7"
 thiserror = "1.0.24"
 htmlescape = "0.3.1"
-fail = "0.5"
+fail = "0.4"
 murmurhash32 = "0.2"
 chrono = "0.4.19"
 smallvec = "1.6.1"
-rayon = "1.5"
 lru = "0.7.0"
 fastdivide = "0.3"
 itertools = "0.10.0"
-measure_time = "0.8.0"
+measure_time = "0.7.0"
+wasm-mt = "0.1"
+wasm-mt-pool = "0.1"
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -65,11 +67,11 @@ maplit = "1.0.2"
 matches = "0.1.8"
 proptest = "1.0"
 criterion = "0.3.5"
-test-log = "0.2.8"
+test-env-log = "0.2.7"
 env_logger = "0.9.0"
 [dev-dependencies.fail]
-version = "0.5"
+version = "0.4"
 features = ["failpoints"]
 [profile.release]
@@ -91,6 +93,7 @@ snappy-compression = ["snap"]
 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
+wasm-bindgen = ["uuid/wasm-bindgen"]
 [workspace]
 members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]


@@ -1,8 +1,8 @@
 [![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/)
-[![Build Status](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml)
-[![codecov](https://codecov.io/gh/quickwit-inc/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/quickwit-inc/tantivy)
-[![Join the chat at https://discord.gg/MT27AG5EVE](https://shields.io/discord/908281611840282624?label=chat%20on%20discord)](https://discord.gg/MT27AG5EVE)
+[![Build Status](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml)
+[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
+[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
@@ -17,6 +17,9 @@
 [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
+[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
 **Tantivy** is a **full text search engine library** written in Rust.
 It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
@@ -75,12 +78,13 @@ It walks you through getting a wikipedia search engine up and running in a few m
 There are many ways to support this project.
-- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
+- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
 - Report bugs
 - Write a blog post
 - Help with documentation by asking questions or submitting PRs
-- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
+- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
 - Talk about Tantivy around you
+- [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
 # Contributing code
@@ -92,7 +96,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
 To check out and run tests, you can simply run:
 ```bash
-git clone https://github.com/quickwit-inc/tantivy.git
+git clone https://github.com/tantivy-search/tantivy.git
 cd tantivy
 cargo build
 ```


@@ -6,7 +6,7 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = []
 description = """Tantivy-sub crate: bitpacking"""
-repository = "https://github.com/quickwit-inc/tantivy"
+repository = "https://github.com/tantivy-search/tantivy"
 keywords = []


@@ -10,7 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
 [dependencies]
 byteorder = "1.4.3"
-ownedbytes = { version="0.2", path="../ownedbytes" }
+ownedbytes = { version="0.1", path="../ownedbytes" }
 [dev-dependencies]
 proptest = "1.0.0"


@@ -36,14 +36,10 @@ impl TinySet {
writer.write_all(self.0.to_le_bytes().as_ref()) writer.write_all(self.0.to_le_bytes().as_ref())
} }
pub fn into_bytes(self) -> [u8; 8] {
self.0.to_le_bytes()
}
#[inline] #[inline]
pub fn deserialize(data: [u8; 8]) -> Self { pub fn deserialize(data: [u8; 8]) -> io::Result<Self> {
let val: u64 = u64::from_le_bytes(data); let val: u64 = u64::from_le_bytes(data);
TinySet(val) Ok(TinySet(val))
} }
/// Returns an empty `TinySet`. /// Returns an empty `TinySet`.
@@ -62,30 +58,29 @@ impl TinySet {
self.0 = 0u64; self.0 = 0u64;
} }
#[inline]
/// Returns the complement of the set in `[0, 64[`. /// Returns the complement of the set in `[0, 64[`.
/// ///
/// Careful on making this function public, as it will break the padding handling in the last /// Careful on making this function public, as it will break the padding handling in the last
/// bucket. /// bucket.
#[inline]
fn complement(self) -> TinySet { fn complement(self) -> TinySet {
TinySet(!self.0) TinySet(!self.0)
} }
/// Returns true iff the `TinySet` contains the element `el`.
#[inline] #[inline]
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool { pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty() !self.intersect(TinySet::singleton(el)).is_empty()
} }
/// Returns the number of elements in the TinySet.
#[inline] #[inline]
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 { pub fn len(self) -> u32 {
self.0.count_ones() self.0.count_ones()
} }
/// Returns the intersection of `self` and `other`
#[inline] #[inline]
#[must_use] /// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet { pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0) TinySet(self.0 & other.0)
} }
@@ -99,14 +94,12 @@ impl TinySet {
/// Insert a new element within [0..64) /// Insert a new element within [0..64)
#[inline] #[inline]
#[must_use]
pub fn insert(self, el: u32) -> TinySet { pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el)) self.union(TinySet::singleton(el))
} }
/// Removes an element within [0..64) /// Removes an element within [0..64)
#[inline] #[inline]
#[must_use]
pub fn remove(self, el: u32) -> TinySet { pub fn remove(self, el: u32) -> TinySet {
self.intersect(TinySet::singleton(el).complement()) self.intersect(TinySet::singleton(el).complement())
} }
@@ -133,7 +126,6 @@ impl TinySet {
/// Returns the union of two tinysets /// Returns the union of two tinysets
#[inline] #[inline]
#[must_use]
pub fn union(self, other: TinySet) -> TinySet { pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0) TinySet(self.0 | other.0)
} }
@@ -190,20 +182,42 @@ impl BitSet {
/// ///
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> { pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.max_value.to_le_bytes().as_ref())?; writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter().cloned() {
writer.write_all(&tinyset.into_bytes())?; for tinyset in self.tinysets.iter() {
tinyset.serialize(writer)?;
} }
writer.flush()?; writer.flush()?;
Ok(()) Ok(())
} }
/// Deserialize a `BitSet`.
///
#[cfg(test)]
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
data = &data[4..];
let mut len: u64 = 0;
let mut tinysets = vec![];
for chunk in data.chunks_exact(8) {
let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?;
len += tinyset.len() as u64;
tinysets.push(tinyset);
}
Ok(BitSet {
tinysets: tinysets.into_boxed_slice(),
len,
max_value,
})
}
/// Create a new `BitSet` that may contain elements /// Create a new `BitSet` that may contain elements
/// within `[0, max_val)`. /// within `[0, max_val)`.
pub fn with_max_value(max_value: u32) -> BitSet { pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value); let num_buckets = num_buckets(max_value);
let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice(); let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet { BitSet {
tinysets: tinybitsets, tinysets: tinybisets,
len: 0, len: 0,
max_value, max_value,
} }
@@ -213,15 +227,14 @@ impl BitSet {
/// within `[0, max_val)`. /// within `[0, max_val)`.
pub fn with_max_value_and_full(max_value: u32) -> BitSet { pub fn with_max_value_and_full(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value); let num_buckets = num_buckets(max_value);
let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); let mut tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
// Fix padding // Fix padding
let lower = max_value % 64u32; let lower = max_value % 64u32;
if lower != 0 { tinybisets[tinybisets.len() - 1] = TinySet::range_lower(lower);
tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
}
BitSet { BitSet {
tinysets: tinybitsets, tinysets: tinybisets,
len: max_value as u64, len: max_value as u64,
max_value, max_value,
} }
@@ -234,22 +247,7 @@ impl BitSet {
} }
} }
/// Intersect with serialized bitset
pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
self.intersect_update_with_iter(other.iter_tinysets());
}
/// Intersect with tinysets
fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
self.len = 0;
for (left, right) in self.tinysets.iter_mut().zip(other) {
*left = left.intersect(right);
self.len += left.len() as u64;
}
}
/// Returns the number of elements in the `BitSet`. /// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.len as usize self.len as usize
} }
@@ -299,7 +297,6 @@ impl BitSet {
.map(|delta_bucket| bucket + delta_bucket as u32) .map(|delta_bucket| bucket + delta_bucket as u32)
} }
#[inline]
pub fn max_value(&self) -> u32 { pub fn max_value(&self) -> u32 {
self.max_value self.max_value
} }
@@ -314,34 +311,16 @@ impl BitSet {
/// Serialized BitSet. /// Serialized BitSet.
#[derive(Clone)] #[derive(Clone)]
pub struct ReadOnlyBitSet { pub struct ReadSerializedBitSet {
data: OwnedBytes, data: OwnedBytes,
max_value: u32, max_value: u32,
} }
pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet { impl ReadSerializedBitSet {
assert_eq!(left.max_value(), other.max_value());
assert_eq!(left.data.len(), other.data.len());
let union_tinyset_it = left
.iter_tinysets()
.zip(other.iter_tinysets())
.map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
for tinyset in union_tinyset_it {
output_dataset.extend_from_slice(&tinyset.into_bytes());
}
ReadOnlyBitSet {
data: OwnedBytes::new(output_dataset),
max_value: left.max_value(),
}
}
impl ReadOnlyBitSet {
pub fn open(data: OwnedBytes) -> Self { pub fn open(data: OwnedBytes) -> Self {
let (max_value_data, data) = data.split(4); let (max_value_data, data) = data.split(4);
assert_eq!(data.len() % 8, 0);
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap()); let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
ReadOnlyBitSet { data, max_value } ReadSerializedBitSet { data, max_value }
} }
/// Number of elements in the bitset. /// Number of elements in the bitset.
@@ -355,9 +334,10 @@ impl ReadOnlyBitSet {
/// Iterate the tinyset on the fly from serialized data. /// Iterate the tinyset on the fly from serialized data.
/// ///
#[inline] #[inline]
fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ { fn iter_tinysets<'a>(&'a self) -> impl Iterator<Item = TinySet> + 'a {
assert!((self.data.len()) % 8 == 0);
self.data.chunks_exact(8).map(move |chunk| { self.data.chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()); let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
tinyset tinyset
}) })
} }
@@ -365,7 +345,7 @@ impl ReadOnlyBitSet {
/// Iterate over the positions of the elements. /// Iterate over the positions of the elements.
/// ///
#[inline] #[inline]
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ { pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
self.iter_tinysets() self.iter_tinysets()
.enumerate() .enumerate()
.flat_map(move |(chunk_num, tinyset)| { .flat_map(move |(chunk_num, tinyset)| {
@@ -395,56 +375,20 @@ impl ReadOnlyBitSet {
pub fn max_value(&self) -> u32 { pub fn max_value(&self) -> u32 {
self.max_value self.max_value
} }
/// Number of bytes used in the bitset representation.
pub fn num_bytes(&self) -> usize {
self.data.len()
}
}
impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
bitset
.serialize(&mut buffer)
.expect("serializing into a buffer should never fail");
ReadOnlyBitSet::open(OwnedBytes::new(buffer))
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::BitSet; use super::BitSet;
use super::ReadOnlyBitSet; use super::ReadSerializedBitSet;
use super::TinySet; use super::TinySet;
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use rand::distributions::Bernoulli; use rand::distributions::Bernoulli;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
use std::collections::HashSet; use std::collections::HashSet;
use std::convert::TryInto;
#[test]
fn test_read_serialized_bitset_full_multi() {
for i in 0..1000 {
let bitset = BitSet::with_max_value_and_full(i);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, i as usize);
}
}
#[test]
fn test_read_serialized_bitset_full_block() {
let bitset = BitSet::with_max_value_and_full(64);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, 64 as usize);
}
#[test] #[test]
fn test_read_serialized_bitset_full() { fn test_read_serialized_bitset_full() {
@@ -453,50 +397,10 @@ mod tests {
let mut out = vec![]; let mut out = vec![];
bitset.serialize(&mut out).unwrap(); bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out)); let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 4); assert_eq!(bitset.len(), 4);
} }
#[test]
fn test_bitset_intersect() {
let bitset_serialized = {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
ReadOnlyBitSet::open(OwnedBytes::new(out))
};
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.intersect_update(&bitset_serialized);
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(bitset.contains(2));
assert!(!bitset.contains(3));
assert!(bitset.contains(4));
bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 1);
bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
assert!(!bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 0);
}
#[test] #[test]
fn test_read_serialized_bitset_empty() { fn test_read_serialized_bitset_empty() {
let mut bitset = BitSet::with_max_value(5); let mut bitset = BitSet::with_max_value(5);
@@ -504,14 +408,14 @@ mod tests {
let mut out = vec![]; let mut out = vec![];
bitset.serialize(&mut out).unwrap(); bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out)); let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 1); assert_eq!(bitset.len(), 1);
{ {
let bitset = BitSet::with_max_value(5); let bitset = BitSet::with_max_value(5);
let mut out = vec![]; let mut out = vec![];
bitset.serialize(&mut out).unwrap(); bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out)); let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 0); assert_eq!(bitset.len(), 0);
} }
} }
@@ -575,9 +479,13 @@ mod tests {
assert!(u.pop_lowest().is_none()); assert!(u.pop_lowest().is_none());
} }
{ {
let original = TinySet::empty().insert(63u32).insert(5); let u = TinySet::empty().insert(63u32).insert(5);
let after_serialize_deserialize = TinySet::deserialize(original.into_bytes()); let mut data = vec![];
assert_eq!(original, after_serialize_deserialize); u.serialize(&mut data).unwrap();
let mut u = TinySet::deserialize(data[..8].try_into().unwrap()).unwrap();
assert_eq!(u.pop_lowest(), Some(5u32));
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
} }
} }
@@ -599,12 +507,12 @@ mod tests {
// test deser // test deser
let mut data = vec![]; let mut data = vec![];
bitset.serialize(&mut data).unwrap(); bitset.serialize(&mut data).unwrap();
let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data)); let bitset = BitSet::deserialize(&data).unwrap();
for el in 0..max_value { for el in 0..max_value {
assert_eq!(hashset.contains(&el), ro_bitset.contains(el)); assert_eq!(hashset.contains(&el), bitset.contains(el));
} }
assert_eq!(ro_bitset.max_value(), max_value); assert_eq!(bitset.max_value(), max_value);
assert_eq!(ro_bitset.len(), els.len()); assert_eq!(bitset.len(), els.len());
}; };
test_against_hashset(&[], 0); test_against_hashset(&[], 0);


@@ -1,5 +1,3 @@
-#![allow(clippy::len_without_is_empty)]
 use std::ops::Deref;
 pub use byteorder::LittleEndian as Endianness;


@@ -54,7 +54,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
     }
 }
-/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
+/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
 ///
 /// The point is that while the type is public, it cannot be built by anyone
 /// outside of this module.


@@ -38,7 +38,7 @@ Note: Tantivy 0.16 does not do this optimization yet.
 In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
 ## Usage
-The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used.
+The index sorting can be configured setting [`sort_by_field`](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used.
 ```
 let settings = IndexSettings {
@@ -55,7 +55,7 @@ let index = index_builder.create_in_ram().unwrap();
 ## Implementation details
-Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
+Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
 In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).
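
The usage section above configures sorting by setting `sort_by_field` on `IndexSettings` and handing the settings to an `IndexBuilder`. A minimal sketch of that wiring, assuming the tantivy 0.16-era `IndexSortByField`/`Order` types at these import paths and an illustrative fast field called `intval`:

```rust
use tantivy::schema::{Schema, FAST, STORED};
use tantivy::{Index, IndexSettings, IndexSortByField, Order};

fn create_sorted_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    // The sort field must be a fast field, hence the FAST flag.
    schema_builder.add_u64_field("intval", FAST | STORED);
    let schema = schema_builder.build();

    // Ask for the index to be sorted by the "intval" fast field, descending.
    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "intval".to_string(),
            order: Order::Desc,
        }),
        ..Default::default()
    };

    // The settings are passed to the builder; the sort itself is applied
    // when segments are serialized (single-segment finish or merge).
    let index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(index)
}
```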


@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
); );
// ... and add it to the `IndexWriter`. // ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc)?; index_writer.add_document(old_man_doc);
// For convenience, tantivy also comes with a macro to // For convenience, tantivy also comes with a macro to
// reduce the boilerplate above. // reduce the boilerplate above.
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
// Multivalued field just need to be repeated. // Multivalued field just need to be repeated.
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
))?; ));
// This is an example, so we will only index 3 documents // This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index // here. You can check out tantivy's tutorial to index


@@ -145,23 +145,23 @@ fn main() -> tantivy::Result<()> {
product_description => "While it is ok for short distance travel, this broom \ product_description => "While it is ok for short distance travel, this broom \
was designed quiditch. It will up your game.", was designed quiditch. It will up your game.",
price => 30_200u64 price => 30_200u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Turbulobroom", product_name => "Turbulobroom",
product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\ product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
You'll enjoy its sharp turns, and rapid acceleration", You'll enjoy its sharp turns, and rapid acceleration",
price => 29_240u64 price => 29_240u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Broomio", product_name => "Broomio",
product_description => "Great value for the price. This broom is a market favorite", product_description => "Great value for the price. This broom is a market favorite",
price => 21_240u64 price => 21_240u64
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Whack a Mole", product_name => "Whack a Mole",
product_description => "Prime quality bat.", product_description => "Prime quality bat.",
price => 5_200u64 price => 5_200u64
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;


@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the fresh and green with every spring, carrying in their lower leaf junctures the
debris of the winters flooding; and sycamores with mottled, white, recumbent debris of the winters flooding; and sycamores with mottled, white, recumbent
limbs and branches that arch over the pool"# limbs and branches that arch over the pool"#
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking."# increasing confidence in the success of my undertaking."#
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;


@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0099908401", isbn => "978-0099908401",
title => "The old Man and the see" title => "The old Man and the see"
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0140177398", isbn => "978-0140177398",
title => "Of Mice and Men", title => "Of Mice and Men",
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankentein", //< Oops there is a typo here. title => "Frankentein", //< Oops there is a typo here.
isbn => "978-9176370711", isbn => "978-9176370711",
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
isbn => "978-9176370711", isbn => "978-9176370711",
))?; ));
// You are guaranteed that your clients will only observe your index in // You are guaranteed that your clients will only observe your index in
// the state it was in after a commit. // the state it was in after a commit.


@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let name = schema_builder.add_text_field("felin_name", TEXT | STORED); let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
// this is our faceted field: its scientific classification // this is our faceted field: its scientific classification
let classification = schema_builder.add_facet_field("classification", FacetOptions::default()); let classification = schema_builder.add_facet_field("classification", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Cat", name => "Cat",
classification => Facet::from("/Felidae/Felinae/Felis") classification => Facet::from("/Felidae/Felinae/Felis")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Canada lynx", name => "Canada lynx",
classification => Facet::from("/Felidae/Felinae/Lynx") classification => Facet::from("/Felidae/Felinae/Lynx")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Cheetah", name => "Cheetah",
classification => Facet::from("/Felidae/Felinae/Acinonyx") classification => Facet::from("/Felidae/Felinae/Acinonyx")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Tiger", name => "Tiger",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Lion", name => "Lion",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Jaguar", name => "Jaguar",
classification => Facet::from("/Felidae/Pantherinae/Panthera") classification => Facet::from("/Felidae/Pantherinae/Panthera")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Sunda clouded leopard", name => "Sunda clouded leopard",
classification => Facet::from("/Felidae/Pantherinae/Neofelis") classification => Facet::from("/Felidae/Pantherinae/Neofelis")
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Fossa", name => "Fossa",
classification => Facet::from("/Eupleridae/Cryptoprocta") classification => Facet::from("/Eupleridae/Cryptoprocta")
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;


@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", STORED); let title = schema_builder.add_text_field("title", STORED);
let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default()); let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
title => "Fried egg", title => "Fried egg",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/oil"), ingredient => Facet::from("/ingredient/oil"),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Scrambled egg", title => "Scrambled egg",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/butter"), ingredient => Facet::from("/ingredient/butter"),
ingredient => Facet::from("/ingredient/milk"), ingredient => Facet::from("/ingredient/milk"),
ingredient => Facet::from("/ingredient/salt"), ingredient => Facet::from("/ingredient/salt"),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Egg rolls", title => "Egg rolls",
ingredient => Facet::from("/ingredient/egg"), ingredient => Facet::from("/ingredient/egg"),
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
ingredient => Facet::from("/ingredient/oil"), ingredient => Facet::from("/ingredient/oil"),
ingredient => Facet::from("/ingredient/tortilla-wrap"), ingredient => Facet::from("/ingredient/tortilla-wrap"),
ingredient => Facet::from("/ingredient/mushroom"), ingredient => Facet::from("/ingredient/mushroom"),
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;


@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
 use tantivy::schema::{Schema, INDEXED};
 use tantivy::{doc, Index, Result};
-fn main() -> Result<()> {
+fn run() -> Result<()> {
     // For the sake of simplicity, this schema will only have 1 field
     let mut schema_builder = Schema::builder();
@@ -19,7 +19,7 @@ fn main() -> Result<()> {
     {
         let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
         for year in 1950u64..2019u64 {
-            index_writer.add_document(doc!(year_field => year))?;
+            index_writer.add_document(doc!(year_field => year));
         }
         index_writer.commit()?;
         // The index will be a range of years
@@ -33,3 +33,7 @@ fn main() -> Result<()> {
     assert_eq!(num_60s_books, 10);
     Ok(())
 }
+fn main() {
+    run().unwrap()
+}


@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema);
     let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
-    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
-    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
-    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
+    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
+    index_writer.add_document(doc!(title => "Of Mice and Men"));
+    index_writer.add_document(doc!(title => "The modern Promotheus"));
     index_writer.commit()?;
     let reader = index.reader()?;


@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
use std::thread; use std::thread;
use std::time::Duration; use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT}; use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError}; use tantivy::{doc, Index, IndexWriter, Opstamp};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -59,11 +59,10 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
println!("add doc {} from thread 1 - opstamp {}", i, opstamp); println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20)); thread::sleep(Duration::from_millis(20));
} }
Result::<(), TantivyError>::Ok(())
}); });
// # Second indexing thread. // # Second indexing thread.
@@ -79,12 +78,11 @@ fn main() -> tantivy::Result<()> {
index_writer_rlock.add_document(doc!( index_writer_rlock.add_document(doc!(
title => "Manufacturing consent", title => "Manufacturing consent",
body => "Some great book description..." body => "Some great book description..."
))? ))
}; };
println!("add doc {} from thread 2 - opstamp {}", i, opstamp); println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10)); thread::sleep(Duration::from_millis(10));
} }
Result::<(), TantivyError>::Ok(())
}); });
// # In the main thread, we commit 10 times, once every 500ms. // # In the main thread, we commit 10 times, once every 500ms.
@@ -92,7 +90,7 @@ fn main() -> tantivy::Result<()> {
let opstamp: Opstamp = { let opstamp: Opstamp = {
// Committing or rollbacking on the other hand requires write lock. This will block other threads. // Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap(); let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()? index_writer_wlock.commit().unwrap()
}; };
println!("committed with opstamp {}", opstamp); println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500)); thread::sleep(Duration::from_millis(500));


@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
let old_man_doc = doc!(title => title_tok, body => body_tok); let old_man_doc = doc!(title => title_tok, body => body_tok);
// ... now let's just add it to the IndexWriter // ... now let's just add it to the IndexWriter
index_writer.add_document(old_man_doc)?; index_writer.add_document(old_man_doc);
// Pretokenized text can also be fed as JSON // Pretokenized text can also be fed as JSON
let short_man_json = r#"{ let short_man_json = r#"{
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
let short_man_doc = schema.parse_document(short_man_json)?; let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?; index_writer.add_document(short_man_doc);
// Let's commit changes // Let's commit changes
index_writer.commit()?; index_writer.commit()?;
@@ -106,7 +106,9 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?; let (top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
assert_eq!(count, 2); assert_eq!(count, 2);
@@ -127,7 +129,9 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?; let (_top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
assert_eq!(count, 0); assert_eq!(count, 0);


@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
// ... // ...
index_writer.commit()?; index_writer.commit()?;
@@ -70,13 +70,13 @@ fn highlight(snippet: Snippet) -> String {
let mut start_from = 0; let mut start_from = 0;
for fragment_range in snippet.highlighted() { for fragment_range in snippet.highlighted() {
result.push_str(&snippet.fragment()[start_from..fragment_range.start]); result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
result.push_str(" --> "); result.push_str(" --> ");
result.push_str(&snippet.fragment()[fragment_range.clone()]); result.push_str(&snippet.fragments()[fragment_range.clone()]);
result.push_str(" <-- "); result.push_str(" <-- ");
start_from = fragment_range.end; start_from = fragment_range.end;
} }
result.push_str(&snippet.fragment()[start_from..]); result.push_str(&snippet.fragments()[start_from..]);
result result
} }


@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
))?; ));
index_writer.commit()?; index_writer.commit()?;


@@ -1,223 +0,0 @@
use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};
use tantivy::collector::TopDocs;
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT};
use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader, TrackedObject};
use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};
// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
type ProductId = u64;
/// Price
type Price = u32;
pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
struct DynamicPriceColumn {
field: Field,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
impl DynamicPriceColumn {
pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
DynamicPriceColumn {
field,
price_cache: Default::default(),
price_fetcher: Box::new(price_fetcher),
}
}
pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
self.price_cache.read().unwrap().get(&segment_key).cloned()
}
}
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(price_vals));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
.collect();
}
}
/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// This map represents a map (ProductId -> Price)
///
/// In practise, it could be fetching things from an external service, like a SQL table.
///
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
}
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices_read = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
fn main() -> tantivy::Result<()> {
// Declaring our schema.
let mut schema_builder = Schema::builder();
// The product id is assumed to be a primary id for our external price source.
let product_id = schema_builder.add_u64_field("product_id", FAST);
let text = schema_builder.add_text_field("text", TEXT);
let schema: Schema = schema_builder.build();
const OLIVE_OIL: ProductId = 323423;
const GLOVES: ProductId = 3966623;
const SNEAKERS: ProductId = 23222;
let price_table = ExternalPriceTable::default();
let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
product_id,
price_table.clone(),
));
price_table.update_price(OLIVE_OIL, 12);
price_table.update_price(GLOVES, 13);
price_table.update_price(SNEAKERS, 80);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index
.reader_builder()
.warmers(warmers)
.num_searchers(1)
.try_into()?;
reader.reload()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;
let searcher = reader.searcher();
let score_by_price = move |segment_reader: &SegmentReader| {
let price = price_dynamic_column
.price_for_segment(segment_reader)
.unwrap();
move |doc_id: DocId| Reverse(price[doc_id as usize])
};
// `Reverse` flips the ordering, so `TopDocs` returns the cheapest products first.
let cheapest_first = TopDocs::with_limit(10).custom_score(score_by_price);
let hits = searcher.search(&query, &cheapest_first)?;
assert_eq!(
&hits,
&[
(
Reverse(12u32),
DocAddress {
segment_ord: 0,
doc_id: 0u32
}
),
(
Reverse(13u32),
DocAddress {
segment_ord: 0,
doc_id: 1u32
}
),
]
);
// Olive oil just got more expensive!
price_table.update_price(OLIVE_OIL, 15);
// The price update is directly reflected after `reload`.
//
// Be careful here though!...
// You may have spotted that we are still using the same `Searcher`.
//
// It is up to the `Warmer` implementer to decide how
// to control this behavior.
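//
// One possible way to control it (a sketch, not part of the original example)
// is to take a fresh `Searcher` after every `reload`, so that a search only
// ever sees the prices loaded by a single warm-up pass:
//
//     reader.reload()?;
//     let fresh_searcher = reader.searcher();
//     let hits = fresh_searcher.search(&query, &cheapest_first)?;
//
// Here we deliberately reuse the old `Searcher` to show that the new prices
// are visible anyway: the scoring closure reads the warmer's cache at search
// time, not at searcher creation time.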
reader.reload()?;
let hits_with_new_prices = searcher.search(&query, &cheapest_first)?;
assert_eq!(
&hits_with_new_prices,
&[
(
Reverse(13u32),
DocAddress {
segment_ord: 0,
doc_id: 1u32
}
),
(
Reverse(15u32),
DocAddress {
segment_ord: 0,
doc_id: 0u32
}
),
]
);
Ok(())
}

View File

@@ -1,7 +1,7 @@
[package] [package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"] authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes" name = "ownedbytes"
version = "0.2.0" version = "0.1.0"
edition = "2018" edition = "2018"
description = "Expose data as static slice" description = "Expose data as static slice"
license = "MIT" license = "MIT"

View File

@@ -1,5 +1,3 @@
#![allow(clippy::return_self_not_must_use)]
use stable_deref_trait::StableDeref; use stable_deref_trait::StableDeref;
use std::convert::TryInto; use std::convert::TryInto;
use std::mem; use std::mem;
@@ -37,8 +35,6 @@ impl OwnedBytes {
} }
/// creates a fileslice that is just a view over a slice of the data. /// creates a fileslice that is just a view over a slice of the data.
#[must_use]
#[inline]
pub fn slice(&self, range: Range<usize>) -> Self { pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes { OwnedBytes {
data: &self.data[range], data: &self.data[range],
@@ -67,8 +63,6 @@ impl OwnedBytes {
/// On the other hand, both `left` and `right` retain a handle over /// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only /// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped. /// be released when both left and right are dropped.
#[inline]
#[must_use]
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) { pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone(); let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes { let left = OwnedBytes {
@@ -82,19 +76,6 @@ impl OwnedBytes {
(left, right) (left, right)
} }
/// Splits the right part of the `OwnedBytes` at the given offset.
///
/// `self` is truncated to `split_len`, left with the remaining bytes.
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
let right_box_stable_deref = self.box_stable_deref.clone();
let right_piece = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
self.data = &self.data[..split_len];
right_piece
}
/// Returns true iff this `OwnedBytes` is empty. /// Returns true iff this `OwnedBytes` is empty.
#[inline] #[inline]
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
@@ -103,6 +84,7 @@ impl OwnedBytes {
/// Drops the left most `advance_len` bytes. /// Drops the left most `advance_len` bytes.
/// ///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline] #[inline]
pub fn advance(&mut self, advance_len: usize) { pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..] self.data = &self.data[advance_len..]
@@ -142,35 +124,6 @@ impl fmt::Debug for OwnedBytes {
} }
} }
impl PartialEq for OwnedBytes {
fn eq(&self, other: &OwnedBytes) -> bool {
self.as_slice() == other.as_slice()
}
}
impl Eq for OwnedBytes {}
impl PartialEq<[u8]> for OwnedBytes {
fn eq(&self, other: &[u8]) -> bool {
self.as_slice() == other
}
}
impl PartialEq<str> for OwnedBytes {
fn eq(&self, other: &str) -> bool {
self.as_slice() == other.as_bytes()
}
}
impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
where
OwnedBytes: PartialEq<T>,
{
fn eq(&self, other: &&'a T) -> bool {
*self == **other
}
}
impl Deref for OwnedBytes { impl Deref for OwnedBytes {
type Target = [u8]; type Target = [u8];
@@ -334,14 +287,4 @@ mod tests {
assert_eq!(right.as_slice(), b""); assert_eq!(right.as_slice(), b"");
} }
} }
#[test]
fn test_split_off() {
let mut data = OwnedBytes::new(b"abcdef".as_ref());
assert_eq!(data, "abcdef");
assert_eq!(data.split_off(2), "cdef");
assert_eq!(data, "ab");
assert_eq!(data.split_off(1), "b");
assert_eq!(data, "a");
}
} }

View File

@@ -5,9 +5,9 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
description = """Search engine library""" description = """Search engine library"""
documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html" documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/quickwit-inc/tantivy" homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/quickwit-inc/tantivy" repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md" readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
edition = "2018" edition = "2018"

View File

@@ -91,7 +91,6 @@ pub enum UserInputAst {
} }
impl UserInputAst { impl UserInputAst {
#[must_use]
pub fn unary(self, occur: Occur) -> UserInputAst { pub fn unary(self, occur: Occur) -> UserInputAst {
UserInputAst::Clause(vec![(Some(occur), self)]) UserInputAst::Clause(vec![(Some(occur), self)])
} }

View File

@@ -20,10 +20,10 @@ use crate::SegmentReader;
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer(3_000_000).unwrap(); /// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap(); /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap(); /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap(); /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap(); /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok()); /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader().unwrap(); /// let reader = index.reader().unwrap();

View File

@@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ```rust /// ```rust
/// use tantivy::collector::FacetCollector; /// use tantivy::collector::FacetCollector;
/// use tantivy::query::AllQuery; /// use tantivy::query::AllQuery;
/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT}; /// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
/// use tantivy::{doc, Index}; /// use tantivy::{doc, Index};
/// ///
/// fn example() -> tantivy::Result<()> { /// fn example() -> tantivy::Result<()> {
@@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// // Facet have their own specific type. /// // Facet have their own specific type.
/// // It is not a bad practise to put all of your /// // It is not a bad practise to put all of your
/// // facet information in the same field. /// // facet information in the same field.
/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default()); /// let facet = schema_builder.add_facet_field("facet", INDEXED);
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// title => "The Name of the Wind", /// title => "The Name of the Wind",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/fantasy") /// facet => Facet::from("/category/fiction/fantasy")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "Dune", /// title => "Dune",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/sci-fi") /// facet => Facet::from("/category/fiction/sci-fi")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "La Vénus d'Ille", /// title => "La Vénus d'Ille",
/// facet => Facet::from("/lang/fr"), /// facet => Facet::from("/lang/fr"),
/// facet => Facet::from("/category/fiction/fantasy"), /// facet => Facet::from("/category/fiction/fantasy"),
/// facet => Facet::from("/category/fiction/horror") /// facet => Facet::from("/category/fiction/horror")
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl", /// title => "The Diary of a Young Girl",
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography") /// facet => Facet::from("/category/biography")
/// ))?; /// ));
/// index_writer.commit()?; /// index_writer.commit()?;
/// } /// }
/// let reader = index.reader()?; /// let reader = index.reader()?;
@@ -400,7 +400,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {
impl FacetCounts { impl FacetCounts {
/// Returns an iterator over all of the facet count pairs inside this result. /// Returns an iterator over all of the facet count pairs inside this result.
/// See the documentation for [FacetCollector] for a usage example. /// See the documentation for `FacetCollector` for a usage example.
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_> pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
where where
Facet: From<T>, Facet: From<T>,
@@ -421,7 +421,7 @@ impl FacetCounts {
} }
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts. /// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
/// See the documentation for [FacetCollector] for a usage example. /// See the documentation for `FacetCollector` for a usage example.
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)> pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
where where
Facet: From<T>, Facet: From<T>,
@@ -462,7 +462,7 @@ mod tests {
use crate::collector::Count; use crate::collector::Count;
use crate::core::Index; use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery}; use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema}; use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
use crate::Term; use crate::Term;
use rand::distributions::Uniform; use rand::distributions::Uniform;
use rand::prelude::SliceRandom; use rand::prelude::SliceRandom;
@@ -470,13 +470,13 @@ mod tests {
use std::iter; use std::iter;
#[test] #[test]
fn test_facet_collector_drilldown() -> crate::Result<()> { fn test_facet_collector_drilldown() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5; let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets) let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| { .map(|mut n| {
@@ -491,14 +491,14 @@ mod tests {
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone()); doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1")); facet_collector.add_facet(Facet::from("/top1"));
let counts = searcher.search(&AllQuery, &facet_collector)?; let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
{ {
let facets: Vec<(String, u64)> = counts let facets: Vec<(String, u64)> = counts
@@ -518,7 +518,6 @@ mod tests {
.collect::<Vec<_>>() .collect::<Vec<_>>()
); );
} }
Ok(())
} }
#[test] #[test]
@@ -531,49 +530,48 @@ mod tests {
} }
#[test] #[test]
fn test_doc_unsorted_multifacet() -> crate::Result<()> { fn test_doc_unsorted_multifacet() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(), facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(), facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(), facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(), facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
))?; ));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1); assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects"); facet_collector.add_facet("/subjects");
let counts = searcher.search(&AllQuery, &facet_collector)?; let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect(); let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1); assert_eq!(facets[0].1, 1);
Ok(())
} }
#[test] #[test]
fn test_doc_search_by_facet() -> crate::Result<()> { fn test_doc_search_by_facet() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A").unwrap(), facet_field => Facet::from_text(&"/A/A").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B").unwrap(), facet_field => Facet::from_text(&"/A/B").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A").unwrap(), facet_field => Facet::from_text(&"/A/C/A").unwrap(),
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A").unwrap(), facet_field => Facet::from_text(&"/D/C/A").unwrap(),
))?; ));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -615,7 +613,7 @@ mod tests {
#[test] #[test]
fn test_facet_collector_topk() { fn test_facet_collector_topk() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -639,7 +637,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
@@ -664,7 +662,7 @@ mod tests {
#[test] #[test]
fn test_facet_collector_topk_tie_break() -> crate::Result<()> { fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -679,7 +677,7 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for doc in docs { for doc in docs {
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit()?;
@@ -727,7 +725,7 @@ mod bench {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();

View File

@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field; use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError}; use crate::{Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate. /// The `FilterCollector` collector filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next collector. /// Only the documents for which the predicate returned "true" will be passed on to the next collector.
/// ///
/// ```rust /// ```rust
@@ -25,37 +25,34 @@ use crate::{Score, SegmentReader, TantivyError};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST}; /// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST); /// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?; /// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?; /// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2)); /// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?; /// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// ///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2)); /// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?; /// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
/// ///
/// assert_eq!(filtered_top_docs.len(), 0); /// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ``` /// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue> pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where where

View File

@@ -226,10 +226,10 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?; writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>-30i64))?; writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-12i64))?; writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-10i64))?; writer.add_document(doc!(val_field=>-10i64));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?; writer.add_document(doc!(val_field=>12i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-30i64))?; writer.add_document(doc!(val_field=>-30i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-12i64))?; writer.add_document(doc!(val_field=>-12i64));
writer.commit()?; writer.commit()?;
writer.add_document(doc!(val_field=>-10i64))?; writer.add_document(doc!(val_field=>-10i64));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?; let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?; writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();

View File

@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
# let mut index_writer = index.writer(3_000_000)?; # let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!( # index_writer.add_document(doc!(
# title => "The Name of the Wind", # title => "The Name of the Wind",
# ))?; # ));
# index_writer.add_document(doc!( # index_writer.add_document(doc!(
# title => "The Diary of Muadib", # title => "The Diary of Muadib",
# ))?; # ));
# index_writer.commit()?; # index_writer.commit()?;
# let reader = index.reader()?; # let reader = index.reader()?;
# let searcher = reader.searcher(); # let searcher = reader.searcher();

View File

@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index}; /// use tantivy::{doc, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
/// ///
/// let reader = index.reader()?; /// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let mut collectors = MultiCollector::new(); /// let mut collectors = MultiCollector::new();
@@ -139,8 +139,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// ///
/// assert_eq!(count, 2); /// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2); /// assert_eq!(top_docs.len(), 2);
/// # Ok(())
/// # }
/// ``` /// ```
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
#[derive(Default)] #[derive(Default)]
@@ -254,24 +252,24 @@ mod tests {
use crate::Term; use crate::Term;
#[test] #[test]
fn test_multi_collector() -> crate::Result<()> { fn test_multi_collector() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"))?; index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"))?; index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"))?; index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""))?; index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"))?; index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"))?; index_writer.add_document(doc!(text=>"abc"));
index_writer.commit()?; index_writer.commit().unwrap();
} }
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let term = Term::from_field_text(text, "abc"); let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
@@ -282,6 +280,5 @@ mod tests {
assert_eq!(count_handler.extract(&mut multifruits), 5); assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2); assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
Ok(())
} }
} }

View File

@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
}; };
#[test] #[test]
pub fn test_filter_collector() -> crate::Result<()> { pub fn test_filter_collector() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST); let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() -> crate::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?; index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]); let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary")?; let query = query_parser.parse_query("diary").unwrap();
let filter_some_collector = FilterCollector::new( let filter_some_collector = FilterCollector::new(
price, price,
&|value: u64| value > 20_120u64, &|value: u64| value > 20_120u64,
TopDocs::with_limit(2), TopDocs::with_limit(2),
); );
let top_docs = searcher.search(&query, &filter_some_collector)?; let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
assert_eq!(top_docs.len(), 1); assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,10 +67,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
} }
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5)); let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?; let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
assert_eq!(filtered_date_docs.len(), 2); assert_eq!(filtered_date_docs.len(), 2);
Ok(())
} }
/// Stores all of the doc ids. /// Stores all of the doc ids.
@@ -275,8 +274,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
Ok(index.reader()?.searcher()) Ok(index.reader()?.searcher())
} }

View File

@@ -70,7 +70,9 @@ where
/// # Panics /// # Panics
/// The method panics if limit is 0 /// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> { pub fn with_limit(limit: usize) -> TopCollector<T> {
assert!(limit >= 1, "Limit must be strictly greater than 0."); if limit < 1 {
panic!("Limit must be strictly greater than 0.");
}
Self { Self {
limit, limit,
offset: 0, offset: 0,

View File

@@ -94,30 +94,27 @@ where
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?; /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?; /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?; /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
/// ///
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3)); /// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ``` /// ```
pub struct TopDocs(TopCollector<Score>); pub struct TopDocs(TopCollector<Score>);
@@ -183,34 +180,30 @@ impl TopDocs {
/// use tantivy::schema::{Schema, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index}; /// use tantivy::{doc, DocAddress, Index};
/// ///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?; /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?; /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?; /// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?; /// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?; /// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// index_writer.commit()?; /// assert!(index_writer.commit().is_ok());
/// ///
/// let reader = index.reader()?; /// let reader = index.reader().unwrap();
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?; /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
/// ///
/// assert_eq!(top_docs.len(), 2); /// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3)); /// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// Ok(())
/// # }
/// ``` /// ```
#[must_use]
pub fn and_offset(self, offset: usize) -> TopDocs { pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset)) TopDocs(self.0.and_offset(offset))
} }
@@ -241,11 +234,11 @@ impl TopDocs {
/// # /// #
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?; /// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?; /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?; /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?; /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # index_writer.commit()?; /// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?; /// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?; /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?; /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
@@ -323,9 +316,9 @@ impl TopDocs {
/// # /// #
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?; /// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?; /// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?; /// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
/// # assert!(index_writer.commit().is_ok()); /// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?; /// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?; /// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
@@ -424,9 +417,9 @@ impl TopDocs {
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap(); /// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?; /// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?; /// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?; /// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
/// index_writer.commit()?; /// index_writer.commit()?;
/// Ok(index) /// Ok(index)
/// } /// }
@@ -534,9 +527,9 @@ impl TopDocs {
/// # /// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap(); /// let boosted: Field = index.schema().get_field("boosted").unwrap();
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?; /// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?; /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?; /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
/// # index_writer.commit()?; /// # index_writer.commit()?;
/// // ... /// // ...
/// # let user_query = "diary"; /// # let user_query = "diary";
@@ -720,18 +713,20 @@ mod tests {
use crate::Score; use crate::Score;
use crate::{DocAddress, DocId, SegmentReader}; use crate::{DocAddress, DocId, SegmentReader};
fn make_index() -> crate::Result<Index> { fn make_index() -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; // writing the segment
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?; let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?; index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"I like Droopy"))?; index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"I like Droopy"));
Ok(index) assert!(index_writer.commit().is_ok());
}
index
} }
fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) { fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -742,15 +737,17 @@ mod tests {
} }
#[test] #[test]
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> { fn test_top_collector_not_at_capacity_without_offset() {
let index = make_index()?; let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax")?; let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index let score_docs: Vec<(Score, DocAddress)> = index
.reader()? .reader()
.unwrap()
.searcher() .searcher()
.search(&text_query, &TopDocs::with_limit(4))?; .search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_results_equals( assert_results_equals(
&score_docs, &score_docs,
&[ &[
@@ -759,12 +756,11 @@ mod tests {
(0.48527452, DocAddress::new(0, 0)), (0.48527452, DocAddress::new(0, 0)),
], ],
); );
Ok(())
} }
#[test] #[test]
fn test_top_collector_not_at_capacity_with_offset() { fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -779,7 +775,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_at_capacity() { fn test_top_collector_at_capacity() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -800,7 +796,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_at_capacity_with_offset() { fn test_top_collector_at_capacity_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -821,7 +817,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_stable_sorting() { fn test_top_collector_stable_sorting() {
let index = make_index().unwrap(); let index = make_index();
// using AllQuery to get a constant score // using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
@@ -852,35 +848,29 @@ mod tests {
const SIZE: &str = "size"; const SIZE: &str = "size";
#[test] #[test]
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> { fn test_top_field_collector_not_at_capacity() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT); let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| { let (index, query) = index("beer", title, schema, |index_writer| {
index_writer index_writer.add_document(doc!(
.add_document(doc!( title => "bottle of beer",
title => "bottle of beer", size => 12u64,
size => 12u64, ));
)) index_writer.add_document(doc!(
.unwrap(); title => "growler of beer",
index_writer size => 64u64,
.add_document(doc!( ));
title => "growler of beer", index_writer.add_document(doc!(
size => 64u64, title => "pint of beer",
)) size => 16u64,
.unwrap(); ));
index_writer
.add_document(doc!(
title => "pint of beer",
size => 16u64,
))
.unwrap();
}); });
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?; let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
assert_eq!( assert_eq!(
&top_docs[..], &top_docs[..],
&[ &[
@@ -889,7 +879,6 @@ mod tests {
(12, DocAddress::new(0, 0)) (12, DocAddress::new(0, 0))
] ]
); );
Ok(())
} }
#[test] #[test]
@@ -905,12 +894,12 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Paul Robeson", name => "Paul Robeson",
birthday => pr_birthday birthday => pr_birthday
))?; ));
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?; let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
name => "Minnie Riperton", name => "Minnie Riperton",
birthday => mr_birthday birthday => mr_birthday
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday); let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
@@ -937,11 +926,11 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "georgetown", city => "georgetown",
altitude => -1i64, altitude => -1i64,
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "tokyo", city => "tokyo",
altitude => 40i64, altitude => 40i64,
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -967,11 +956,11 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "georgetown", city => "georgetown",
altitude => -1.0f64, altitude => -1.0f64,
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
city => "tokyo", city => "tokyo",
altitude => 40f64, altitude => 40f64,
))?; ));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude); let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -994,12 +983,10 @@ mod tests {
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let (index, _) = index("beer", title, schema, |index_writer| { let (index, _) = index("beer", title, schema, |index_writer| {
index_writer index_writer.add_document(doc!(
.add_document(doc!( title => "bottle of beer",
title => "bottle of beer", size => 12u64,
size => 12u64, ));
))
.unwrap();
}); });
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2)); let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
@@ -1016,7 +1003,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64))?; index_writer.add_document(doc!(size=>1u64));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0); let segment = searcher.segment_reader(0);
@@ -1033,7 +1020,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64))?; index_writer.add_document(doc!(size=>1u64));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0); let segment = searcher.segment_reader(0);
@@ -1046,26 +1033,30 @@ mod tests {
} }
#[test] #[test]
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> { fn test_tweak_score_top_collector_with_offset() {
let index = make_index()?; let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax")?; let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score( let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc, move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
); );
let score_docs: Vec<(u32, DocAddress)> = let score_docs: Vec<(u32, DocAddress)> = index
index.reader()?.searcher().search(&text_query, &collector)?; .reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!( assert_eq!(
score_docs, score_docs,
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),] vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
); );
Ok(())
} }
#[test] #[test]
fn test_custom_score_top_collector_with_offset() { fn test_custom_score_top_collector_with_offset() {
let index = make_index().unwrap(); let index = make_index();
let field = index.schema().get_field("text").unwrap(); let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]); let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap(); let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -1,5 +1,4 @@
use crossbeam::channel; use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder};
/// Search executor whether search requests are single thread or multithread. /// Search executor whether search requests are single thread or multithread.
/// ///
@@ -11,8 +10,6 @@ use rayon::{ThreadPool, ThreadPoolBuilder};
pub enum Executor { pub enum Executor {
/// Single thread variant of an Executor /// Single thread variant of an Executor
SingleThread, SingleThread,
/// Thread pool variant of an Executor
ThreadPool(ThreadPool),
} }
impl Executor { impl Executor {
@@ -21,15 +18,6 @@ impl Executor {
Executor::SingleThread Executor::SingleThread
} }
/// Creates an Executor that dispatches the tasks in a thread pool.
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
let pool = ThreadPoolBuilder::new()
.num_threads(num_threads)
.thread_name(move |num| format!("{}{}", prefix, num))
.build()?;
Ok(Executor::ThreadPool(pool))
}
/// Perform a map in the thread pool. /// Perform a map in the thread pool.
/// ///
/// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task /// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
@@ -46,40 +34,6 @@ impl Executor {
) -> crate::Result<Vec<R>> { ) -> crate::Result<Vec<R>> {
match self { match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(), Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
Executor::ThreadPool(pool) => {
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
let num_fruits = args_with_indices.len();
let fruit_receiver = {
let (fruit_sender, fruit_receiver) = channel::unbounded();
pool.scope(|scope| {
for arg_with_idx in args_with_indices {
scope.spawn(|_| {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
}
});
}
});
fruit_receiver
// This ends the scope of fruit_sender.
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
} }
} }
} }
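
Note on the removed `ThreadPool` arm above: it fans each task out to a rayon pool, streams `(index, result)` pairs back over a crossbeam channel, and restores submission order before returning. A standalone sketch of the same pattern, assuming `rayon` and `crossbeam` as dependencies (function name, thread count, and bounds are illustrative, not tantivy's actual signature):

    use crossbeam::channel;
    use rayon::ThreadPoolBuilder;

    // Map `f` over `args` on a small thread pool while preserving input order.
    fn map_in_pool<A: Send, R: Send, F: Fn(A) -> R + Sync>(f: F, args: Vec<A>) -> Vec<R> {
        let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
        let (tx, rx) = channel::unbounded();
        // rayon's scope blocks until every spawned task has finished and
        // propagates any panic that occurred in a task.
        pool.scope(|scope| {
            for (idx, arg) in args.into_iter().enumerate() {
                let tx = tx.clone();
                let f = &f;
                scope.spawn(move |_| {
                    let _ = tx.send((idx, f(arg)));
                });
            }
        });
        drop(tx); // close the channel so the collecting iterator terminates
        let mut indexed: Vec<(usize, R)> = rx.into_iter().collect();
        indexed.sort_by_key(|(idx, _)| *idx);
        indexed.into_iter().map(|(_, result)| result).collect()
    }
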
@@ -123,8 +123,8 @@ impl IndexBuilder {
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error. /// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> { pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?); let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&*mmap_directory)? { if Index::exists(&mmap_directory)? {
return Err(TantivyError::IndexAlreadyExists); return Err(TantivyError::IndexAlreadyExists);
} }
self.create(mmap_directory) self.create(mmap_directory)
@@ -139,7 +139,7 @@ impl IndexBuilder {
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`. /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn create_from_tempdir(self) -> crate::Result<Index> { pub fn create_from_tempdir(self) -> crate::Result<Index> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?); let mmap_directory = MmapDirectory::create_from_tempdir()?;
self.create(mmap_directory) self.create(mmap_directory)
} }
fn get_expect_schema(&self) -> crate::Result<Schema> { fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,9 +149,8 @@ impl IndexBuilder {
.ok_or(TantivyError::IndexBuilderMissingArgument("schema")) .ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
} }
/// Opens or creates a new index in the provided directory /// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> { pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
let dir = dir.into(); if !Index::exists(&dir)? {
if !Index::exists(&*dir)? {
return self.create(dir); return self.create(dir);
} }
let index = Index::open(dir)?; let index = Index::open(dir)?;
@@ -166,8 +165,7 @@ impl IndexBuilder {
/// Creates a new index given an implementation of the trait `Directory`. /// Creates a new index given an implementation of the trait `Directory`.
/// ///
/// If a directory previously existed, it will be erased. /// If a directory previously existed, it will be erased.
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> { fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
let dir = dir.into();
let directory = ManagedDirectory::wrap(dir)?; let directory = ManagedDirectory::wrap(dir)?;
save_new_metas( save_new_metas(
self.get_expect_schema()?, self.get_expect_schema()?,
@@ -200,7 +198,7 @@ impl Index {
/// Examines the directory to see if it contains an index. /// Examines the directory to see if it contains an index.
/// ///
/// Effectively, it only checks for the presence of the `meta.json` file. /// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> { pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
dir.exists(&META_FILEPATH) dir.exists(&META_FILEPATH)
} }
@@ -217,7 +215,7 @@ impl Index {
/// Replace the default single thread search executor pool /// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads. /// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> { pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?); self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
Ok(()) Ok(())
} }
@@ -252,11 +250,7 @@ impl Index {
} }
/// Opens or creates a new index in the provided directory /// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>( pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
dir: T,
schema: Schema,
) -> crate::Result<Index> {
let dir = dir.into();
IndexBuilder::new().schema(schema).open_or_create(dir) IndexBuilder::new().schema(schema).open_or_create(dir)
} }
@@ -276,12 +270,11 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`. /// Creates a new index given an implementation of the trait `Directory`.
/// ///
/// If a directory previously existed, it will be erased. /// If a directory previously existed, it will be erased.
pub fn create<T: Into<Box<dyn Directory>>>( pub fn create<Dir: Directory>(
dir: T, dir: Dir,
schema: Schema, schema: Schema,
settings: IndexSettings, settings: IndexSettings,
) -> crate::Result<Index> { ) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
let mut builder = IndexBuilder::new().schema(schema); let mut builder = IndexBuilder::new().schema(schema);
builder = builder.settings(settings); builder = builder.settings(settings);
builder.create(dir) builder.create(dir)
@@ -372,8 +365,7 @@ impl Index {
} }
/// Open the index using the provided directory /// Open the index using the provided directory
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> { pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
let directory = directory.into();
let directory = ManagedDirectory::wrap(directory)?; let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default(); let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?; let metas = load_metas(&directory, &inventory)?;
@@ -403,7 +395,9 @@ impl Index {
/// ///
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`. /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` ///
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads( pub fn writer_with_num_threads(
&self, &self,
num_threads: usize, num_threads: usize,
@@ -445,13 +439,14 @@ impl Index {
/// Creates a multithreaded writer /// Creates a multithreaded writer
/// ///
/// Tantivy will automatically define the number of threads to use, but /// Tantivy will automatically define the number of threads to use, but
/// no more than 8 threads. /// no more than [`MAX_NUM_THREAD`] threads.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split /// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads. /// between a given number of threads.
/// ///
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` /// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> { pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD); let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
@@ -582,15 +577,15 @@ mod tests {
#[test] #[test]
fn test_index_exists() { fn test_index_exists() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(!Index::exists(directory.as_ref()).unwrap()); assert!(!Index::exists(&directory).unwrap());
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
} }
#[test] #[test]
@@ -603,27 +598,27 @@ mod tests {
#[test] #[test]
fn open_or_create_should_open() { fn open_or_create_should_open() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok()); assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
} }
#[test] #[test]
fn create_should_wipeoff_existing() { fn create_should_wipeoff_existing() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create()); let directory = RamDirectory::create();
assert!(Index::create( assert!(Index::create(
directory.clone(), directory.clone(),
throw_away_schema(), throw_away_schema(),
IndexSettings::default() IndexSettings::default()
) )
.is_ok()); .is_ok());
assert!(Index::exists(directory.as_ref()).unwrap()); assert!(Index::exists(&directory).unwrap());
assert!(Index::create( assert!(Index::create(
directory, directory,
Schema::builder().build(), Schema::builder().build(),
@@ -657,7 +652,7 @@ mod tests {
} }
#[test] #[test]
fn test_index_on_commit_reload_policy() -> crate::Result<()> { fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -667,7 +662,7 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader) test_index_on_commit_reload_policy_aux(field, &index, &reader);
} }
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
@@ -679,7 +674,7 @@ mod tests {
use tempfile::TempDir; use tempfile::TempDir;
#[test] #[test]
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> { fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap(); let tempdir = TempDir::new().unwrap();
@@ -691,7 +686,7 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader) test_index_on_commit_reload_policy_aux(field, &index, &reader);
} }
#[test] #[test]
@@ -706,7 +701,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()?;
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?; writer.add_document(doc!(field=>1u64));
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || { let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
@@ -720,7 +715,7 @@ mod tests {
} }
#[test] #[test]
fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> { fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap(); let tempdir = TempDir::new().unwrap();
@@ -733,14 +728,10 @@ mod tests {
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &write_index, &reader) test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
} }
} }
fn test_index_on_commit_reload_policy_aux( fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
field: Field,
index: &Index,
reader: &IndexReader,
) -> crate::Result<()> {
let mut reader_index = reader.index(); let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index let _watch_handle = reader_index
@@ -748,9 +739,9 @@ mod tests {
.watch(WatchCallback::new(move || { .watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
let mut writer = index.writer_for_tests()?; let mut writer = index.writer_for_tests().unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?; writer.add_document(doc!(field=>1u64));
writer.commit().unwrap(); writer.commit().unwrap();
// We need a loop here because it is possible for notify to send more than // We need a loop here because it is possible for notify to send more than
// one modify event. It was observed on CI on MacOS. // one modify event. It was observed on CI on MacOS.
@@ -760,7 +751,7 @@ mod tests {
break; break;
} }
} }
writer.add_document(doc!(field=>2u64))?; writer.add_document(doc!(field=>2u64));
writer.commit().unwrap(); writer.commit().unwrap();
// ... Same as above // ... Same as above
loop { loop {
@@ -769,37 +760,37 @@ mod tests {
break; break;
} }
} }
Ok(())
} }
// This test will not pass on windows, because windows // This test will not pass on windows, because windows
// prevent deleting files that are MMapped. // prevent deleting files that are MMapped.
#[cfg(not(target_os = "windows"))] #[cfg(not(target_os = "windows"))]
#[test] #[test]
fn garbage_collect_works_as_intended() -> crate::Result<()> { fn garbage_collect_works_as_intended() {
let directory = RamDirectory::create(); let directory = RamDirectory::create();
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?; let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap(); let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 { for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i))?; writer.add_document(doc!(field => i));
} }
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = directory.watch(WatchCallback::new(move || { let _handle = directory.watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
writer.commit()?; writer.commit().unwrap();
let mem_right_after_commit = directory.total_mem_usage(); let mem_right_after_commit = directory.total_mem_usage();
assert!(receiver.recv().is_ok()); assert!(receiver.recv().is_ok());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 8_000); assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads()?; writer.wait_merging_threads().unwrap();
let mem_right_after_merge_finished = directory.total_mem_usage(); let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap(); reader.reload().unwrap();
@@ -811,6 +802,5 @@ mod tests {
mem_right_after_merge_finished, mem_right_after_merge_finished,
mem_right_after_commit mem_right_after_commit
); );
Ok(())
} }
} }
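
The tests above exercise the directory-handling changes: a concrete directory value such as `RamDirectory` is passed straight to `create`, `exists`, and `open_or_create`. A minimal sketch of that round trip, assuming the crate-level API shown in these hunks (field name and heap size are illustrative; on versions where `add_document` returns a `Result`, the call should also be checked):

    use tantivy::directory::RamDirectory;
    use tantivy::schema::{Schema, FAST};
    use tantivy::{doc, Index, IndexSettings};

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        let num_likes = schema_builder.add_u64_field("num_likes", FAST);
        let schema = schema_builder.build();

        // Create an index backed by a cloneable in-memory directory.
        let directory = RamDirectory::create();
        let index = Index::create(directory.clone(), schema.clone(), IndexSettings::default())?;
        assert!(Index::exists(&directory).unwrap());

        let mut writer = index.writer(50_000_000)?;
        writer.add_document(doc!(num_likes => 1u64)); // returns a Result on newer versions
        writer.commit()?;

        // Re-opening the same directory takes the "open" branch of open_or_create.
        let reopened = Index::open_or_create(directory, schema)?;
        assert_eq!(reopened.reader()?.searcher().num_docs(), 1);
        Ok(())
    }
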
@@ -2,7 +2,7 @@ use super::SegmentComponent;
use crate::schema::Schema; use crate::schema::Schema;
use crate::Opstamp; use crate::Opstamp;
use crate::{core::SegmentId, store::Compressor}; use crate::{core::SegmentId, store::Compressor};
use crate::{Inventory, TrackedObject}; use census::{Inventory, TrackedObject};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::PathBuf; use std::path::PathBuf;
use std::{collections::HashSet, sync::atomic::AtomicBool}; use std::{collections::HashSet, sync::atomic::AtomicBool};
@@ -189,10 +189,6 @@ impl SegmentMeta {
#[doc(hidden)] #[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta { pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
assert!(
num_deleted_docs <= self.max_doc(),
"There cannot be more deleted docs than there are docs."
);
let delete_meta = DeleteMeta { let delete_meta = DeleteMeta {
num_deleted_docs, num_deleted_docs,
opstamp, opstamp,
@@ -398,7 +394,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!( assert_eq!(
json, json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"# r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
); );
} }
} }
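
For reference, settings matching the serialized form above can be constructed roughly like this; a sketch only, since the exact field set of `IndexSettings` and the available `Compressor` variants vary across versions:

    use tantivy::store::Compressor;
    use tantivy::{IndexSettings, IndexSortByField, Order};

    fn settings_like_the_json_above() -> IndexSettings {
        // Mirrors {"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"}
        IndexSettings {
            sort_by_field: Some(IndexSortByField {
                field: "text".to_string(),
                order: Order::Asc,
            }),
            docstore_compression: Compressor::Lz4,
            ..IndexSettings::default()
        }
    }
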
@@ -14,7 +14,7 @@ pub use self::index_meta::{
IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory, IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
}; };
pub use self::inverted_index_reader::InvertedIndexReader; pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::{Searcher, SearcherGeneration}; pub use self::searcher::Searcher;
pub use self::segment::Segment; pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent; pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId; pub use self::segment_id::SegmentId;
@@ -1,5 +1,6 @@
use crate::collector::Collector; use crate::collector::Collector;
use crate::core::Executor; use crate::core::Executor;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::query::Query; use crate::query::Query;
use crate::schema::Document; use crate::schema::Document;
@@ -9,62 +10,9 @@ use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
use crate::DocAddress; use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::Opstamp;
use crate::SegmentId;
use crate::TrackedObject;
use std::collections::BTreeMap;
use std::{fmt, io}; use std::{fmt, io};
/// Identifies the searcher generation accessed by a [Searcher].
///
/// While this might seem redundant, a [SearcherGeneration] contains
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
///
/// This is on purpose. This object is used by the `Warmer` API.
/// Having both information makes it possible to identify which
/// artifact should be refreshed or garbage collected.
///
/// Depending on the use case, `Warmer`'s implementers can decide to
/// produce artifacts per:
/// - `generation_id` (e.g. some searcher level aggregates)
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
/// - `segment_id` (e.g. for immutable document level information)
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
/// - ...
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SearcherGeneration {
segments: BTreeMap<SegmentId, Option<Opstamp>>,
generation_id: u64,
}
impl SearcherGeneration {
pub(crate) fn from_segment_readers(
segment_readers: &[SegmentReader],
generation_id: u64,
) -> Self {
let mut segment_id_to_del_opstamp = BTreeMap::new();
for segment_reader in segment_readers {
segment_id_to_del_opstamp
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
}
Self {
segments: segment_id_to_del_opstamp,
generation_id,
}
}
/// Returns the searcher generation id.
pub fn generation_id(&self) -> u64 {
self.generation_id
}
/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
&self.segments
}
}
/// Holds a list of `SegmentReader`s ready for search. /// Holds a list of `SegmentReader`s ready for search.
/// ///
/// It guarantees that the `Segment` will not be removed before /// It guarantees that the `Segment` will not be removed before
@@ -75,7 +23,6 @@ pub struct Searcher {
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>, store_readers: Vec<StoreReader>,
generation: TrackedObject<SearcherGeneration>,
} }
impl Searcher { impl Searcher {
@@ -84,7 +31,6 @@ impl Searcher {
schema: Schema, schema: Schema,
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
) -> io::Result<Searcher> { ) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers let store_readers: Vec<StoreReader> = segment_readers
.iter() .iter()
@@ -95,7 +41,6 @@ impl Searcher {
index, index,
segment_readers, segment_readers,
store_readers, store_readers,
generation,
}) })
} }
@@ -104,11 +49,6 @@ impl Searcher {
&self.index &self.index
} }
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
pub fn generation(&self) -> &SearcherGeneration {
self.generation.as_ref()
}
/// Fetches a document from tantivy's store given a `DocAddress`. /// Fetches a document from tantivy's store given a `DocAddress`.
/// ///
/// The searcher uses the segment ordinal to route the /// The searcher uses the segment ordinal to route the
@@ -148,7 +88,7 @@ impl Searcher {
&self.segment_readers &self.segment_readers
} }
/// Returns the segment_reader associated with the given segment_ord /// Returns the segment_reader associated with the given segment_ordinal
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader { pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize] &self.segment_readers[segment_ord as usize]
} }
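
A short sketch of how a `Searcher` is typically driven end to end, grounded in the accessors kept above (`doc` fetches the stored fields behind a `DocAddress`, `segment_readers` exposes the per-segment view); schema, query, and heap size are illustrative:

    use tantivy::collector::TopDocs;
    use tantivy::query::QueryParser;
    use tantivy::schema::{Schema, STORED, TEXT};
    use tantivy::{doc, Index};

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT | STORED);
        let index = Index::create_in_ram(schema_builder.build());

        let mut writer = index.writer(50_000_000)?;
        writer.add_document(doc!(title => "bottle of beer")); // returns a Result on newer versions
        writer.commit()?;

        let searcher = index.reader()?.searcher();
        println!("searching across {} segment(s)", searcher.segment_readers().len());

        let query = QueryParser::for_index(&index, vec![title]).parse_query("beer")?;
        for (_score, doc_address) in searcher.search(&query, &TopDocs::with_limit(3))? {
            // The searcher routes the address to the right segment's store reader.
            let stored_doc = searcher.doc(doc_address)?;
            println!("{:?}", stored_doc);
        }
        Ok(())
    }
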
@@ -5,7 +5,6 @@ use crate::core::SegmentId;
use crate::directory::CompositeFile; use crate::directory::CompositeFile;
use crate::directory::FileSlice; use crate::directory::FileSlice;
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::intersect_alive_bitsets;
use crate::fastfield::AliveBitSet; use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader; use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders; use crate::fastfield::FastFieldReaders;
@@ -17,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
use crate::termdict::TermDictionary; use crate::termdict::TermDictionary;
use crate::DocId; use crate::DocId;
use crate::Opstamp;
use fail::fail_point; use fail::fail_point;
use std::fmt; use std::fmt;
use std::sync::Arc; use std::sync::Arc;
@@ -39,8 +37,6 @@ pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>, inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId, segment_id: SegmentId,
delete_opstamp: Option<Opstamp>,
max_doc: DocId, max_doc: DocId,
num_docs: DocId, num_docs: DocId,
@@ -103,7 +99,7 @@ impl SegmentReader {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() { match field_entry.field_type() {
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
let term_ords_reader = self.fast_fields().u64s(field)?; let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict = self let termdict = self
.termdict_composite .termdict_composite
@@ -130,17 +126,13 @@ impl SegmentReader {
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
let field_name = self.schema.get_field_name(field); let field_name = self.schema.get_field_name(field);
let err_msg = format!( let err_msg = format!(
"Field norm not found for field {:?}. Was the field set to record norm during indexing?", "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
field_name field_name
); );
crate::TantivyError::SchemaError(err_msg) crate::TantivyError::SchemaError(err_msg)
}) })
} }
pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
&self.fieldnorm_readers
}
/// Accessor to the segment's `StoreReader`. /// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> io::Result<StoreReader> { pub fn get_store_reader(&self) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone()) StoreReader::open(self.store_file.clone())
@@ -148,14 +140,6 @@ impl SegmentReader {
/// Open a new segment for reading. /// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> { pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}
/// Open a new segment for reading.
pub fn open_with_custom_alive_set(
segment: &Segment,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::Terms)?; let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?; let termdict_composite = CompositeFile::open(&termdict_file)?;
@@ -180,35 +164,27 @@ impl SegmentReader {
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?; let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers = let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite)); Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let original_bitset = if segment.meta().has_deletes() { let alive_bitset_opt = if segment.meta().has_deletes() {
let delete_file_slice = segment.open_read(SegmentComponent::Delete)?; let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?;
let delete_data = delete_file_slice.read_bytes()?; let alive_bitset = AliveBitSet::open(alive_bitset_bytes);
Some(AliveBitSet::open(delete_data)) Some(alive_bitset)
} else { } else {
None None
}; };
let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
let max_doc = segment.meta().max_doc();
let num_docs = alive_bitset_opt
.as_ref()
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
.unwrap_or(max_doc);
Ok(SegmentReader { Ok(SegmentReader {
inv_idx_reader_cache: Default::default(), inv_idx_reader_cache: Default::default(),
num_docs, max_doc: segment.meta().max_doc(),
max_doc, num_docs: segment.meta().num_docs(),
termdict_composite, termdict_composite,
postings_composite, postings_composite,
fast_fields_readers: fast_field_readers, fast_fields_readers: fast_field_readers,
fieldnorm_readers, fieldnorm_readers,
segment_id: segment.id(), segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file, store_file,
alive_bitset_opt, alive_bitset_opt,
positions_composite, positions_composite,
@@ -294,11 +270,6 @@ impl SegmentReader {
self.segment_id self.segment_id
} }
/// Returns the delete opstamp
pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.delete_opstamp
}
/// Returns the bitset representing /// Returns the bitset representing
/// the documents that have been deleted. /// the documents that have been deleted.
pub fn alive_bitset(&self) -> Option<&AliveBitSet> { pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
@@ -340,21 +311,6 @@ impl SegmentReader {
} }
} }
fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>,
) -> Option<AliveBitSet> {
match (left_opt, right_opt) {
(Some(left), Some(right)) => {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
Some(intersect_alive_bitsets(left, right))
}
(Some(left), None) => Some(left),
(None, Some(right)) => Some(right),
(None, None) => None,
}
}
impl fmt::Debug for SegmentReader { impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id) write!(f, "SegmentReader({:?})", self.segment_id)
@@ -377,10 +333,10 @@ mod test {
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"))?; index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs // we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse")); index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap")); index_writer.delete_term(Term::from_field_text(name, "cap"));
@@ -403,10 +359,10 @@ mod test {
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"))?; index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs // we should now have one segment with two docs
index_writer.commit()?; index_writer.commit()?;
} }
@@ -43,8 +43,10 @@ impl RetryPolicy {
} }
/// The `DirectoryLock` is an object that represents a file lock. /// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
/// ///
/// It is associated to a lock file, that gets deleted on `Drop.` /// It is transparently associated to a lock file, that gets deleted
/// on `Drop.` The lock is released automatically on `Drop`.
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>); pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
struct DirectoryLockGuard { struct DirectoryLockGuard {
@@ -140,16 +142,10 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Opens a writer for the *virtual file* associated with /// Opens a writer for the *virtual file* associated with
/// a Path. /// a Path.
/// ///
/// Right after this call, for the span of the execution of the program /// Right after this call, the file should be created
/// the file should be created and any subsequent call to `open_read` for the /// and any subsequent call to `open_read` for the
/// same path should return a `FileSlice`. /// same path should return a `FileSlice`.
/// ///
/// However, depending on the directory implementation,
/// it might be required to call `sync_directory` to ensure
/// that the file is durably created.
/// (The semantics here are the same when dealing with
/// a posix filesystem.)
///
/// Write operations may be aggressively buffered. /// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush /// The client of this trait is responsible for calling flush
/// to ensure that subsequent `read` operations /// to ensure that subsequent `read` operations
@@ -180,12 +176,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// The file may or may not previously exist. /// The file may or may not previously exist.
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>; fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Sync the directory.
///
/// This call is required to ensure that newly created files are
/// effectively stored durably.
fn sync_directory(&self) -> io::Result<()>;
/// Acquire a lock in the given directory. /// Acquire a lock in the given directory.
/// ///
/// The method is blocking or not depending on the `Lock` object. /// The method is blocking or not depending on the `Lock` object.
@@ -240,15 +230,3 @@ where
Box::new(self.clone()) Box::new(self.clone())
} }
} }
impl Clone for Box<dyn Directory> {
fn clone(&self) -> Self {
self.box_clone()
}
}
impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
fn from(t: T) -> Self {
Box::new(t)
}
}
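
The trait above boils down to a few primitives: `open_write`/`open_read` for bulk segment data, `atomic_write`/`atomic_read` for small metadata files, plus locking and watching. A minimal sketch against the in-memory implementation (file names and payloads are illustrative):

    use std::io::Write;
    use std::path::Path;
    use tantivy::directory::{Directory, RamDirectory, TerminatingWrite};

    fn main() -> tantivy::Result<()> {
        let directory = RamDirectory::create();

        // Small metadata files go through the atomic read/write pair.
        let meta_path = Path::new("meta.json");
        directory.atomic_write(meta_path, b"tantivy meta")?;
        assert_eq!(directory.atomic_read(meta_path)?, b"tantivy meta".to_vec());

        // Bulk data goes through a buffered writer and comes back as a FileSlice.
        let data_path = Path::new("data.bin");
        let mut wrt = directory.open_write(data_path)?;
        wrt.write_all(&[1u8, 2, 3])?;
        wrt.terminate()?; // flush and finalize; RamDirectory warns or panics if this is skipped
        let bytes = directory.open_read(data_path)?.read_bytes()?;
        assert_eq!(bytes.as_slice(), &[1u8, 2, 3]);
        Ok(())
    }
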
@@ -7,8 +7,8 @@ use std::path::PathBuf;
/// [`LockParams`](./enum.LockParams.html). /// [`LockParams`](./enum.LockParams.html).
/// Tantivy itself uses only two locks but client application /// Tantivy itself uses only two locks but client application
/// can use the directory facility to define their own locks. /// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK] /// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK] /// - [META_LOCK](./struct.META_LOCK.html)
/// ///
/// Check out these locks documentation for more information. /// Check out these locks documentation for more information.
/// ///
@@ -39,16 +39,6 @@ pub enum OpenDirectoryError {
}, },
} }
impl OpenDirectoryError {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
Self::IoError {
io_error,
directory_path,
}
}
}
/// Error that may occur when starting to write in a file /// Error that may occur when starting to write in a file
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum OpenWriteError { pub enum OpenWriteError {
@@ -66,7 +66,6 @@ impl FileSlice {
/// Wraps a FileHandle. /// Wraps a FileHandle.
#[doc(hidden)] #[doc(hidden)]
#[must_use]
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self { pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
FileSlice { FileSlice {
data: Arc::from(file_handle), data: Arc::from(file_handle),
@@ -43,16 +43,14 @@ impl FileWatcher {
thread::Builder::new() thread::Builder::new()
.name("thread-tantivy-meta-file-watcher".to_string()) .name("thread-tantivy-meta-file-watcher".to_string())
.spawn(move || { .spawn(move || {
let mut current_checksum_opt = None; let mut current_checksum = None;
while state.load(Ordering::SeqCst) == 1 { while state.load(Ordering::SeqCst) == 1 {
if let Ok(checksum) = FileWatcher::compute_checksum(&path) { if let Ok(checksum) = FileWatcher::compute_checksum(&path) {
let metafile_has_changed = current_checksum_opt // `None.unwrap_or_else(|| !checksum) != checksum` evaluates to `true`
.map(|current_checksum| current_checksum != checksum) if current_checksum.unwrap_or_else(|| !checksum) != checksum {
.unwrap_or(true);
if metafile_has_changed {
info!("Meta file {:?} was modified", path); info!("Meta file {:?} was modified", path);
current_checksum_opt = Some(checksum); current_checksum = Some(checksum);
futures::executor::block_on(callbacks.broadcast()); futures::executor::block_on(callbacks.broadcast());
} }
} }
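
Both variants of the loop above implement the same idea: poll the meta file, hash it, and fire the callbacks only when the hash changes. A dependency-free sketch of that polling pattern (interval and hashing choice are illustrative; the real watcher also checks an atomic stop flag):

    use std::collections::hash_map::DefaultHasher;
    use std::hash::Hasher;
    use std::path::Path;
    use std::time::Duration;
    use std::{fs, thread};

    fn compute_checksum(path: &Path) -> std::io::Result<u64> {
        let bytes = fs::read(path)?;
        let mut hasher = DefaultHasher::new();
        hasher.write(&bytes);
        Ok(hasher.finish())
    }

    fn watch(path: &Path, on_change: impl Fn()) {
        let mut current_checksum_opt: Option<u64> = None;
        loop {
            if let Ok(checksum) = compute_checksum(path) {
                let changed = current_checksum_opt
                    .map(|current| current != checksum)
                    .unwrap_or(true);
                if changed {
                    current_checksum_opt = Some(checksum);
                    on_change();
                }
            }
            thread::sleep(Duration::from_millis(500));
        }
    }
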
@@ -64,7 +64,7 @@ fn save_managed_paths(
impl ManagedDirectory { impl ManagedDirectory {
/// Wraps a directory as managed directory. /// Wraps a directory as managed directory.
pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> { pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) { match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => { Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data); let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
) )
})?; })?;
Ok(ManagedDirectory { Ok(ManagedDirectory {
directory, directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation { meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files, managed_paths: managed_files,
})), })),
}) })
} }
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory { Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory, directory: Box::new(directory),
meta_informations: Arc::default(), meta_informations: Arc::default(),
}), }),
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()), io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -192,7 +192,6 @@ impl ManagedDirectory {
for delete_file in &deleted_files { for delete_file in &deleted_files {
managed_paths_write.remove(delete_file); managed_paths_write.remove(delete_file);
} }
self.directory.sync_directory()?;
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?; save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
} }
@@ -223,22 +222,9 @@ impl ManagedDirectory {
.write() .write()
.expect("Managed file lock poisoned"); .expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if !has_changed { if has_changed {
return Ok(()); save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
} }
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
// This is not the first file we add.
// Therefore, we are sure that `.managed.json` has been already
// properly created and we do not need to sync its parent directory.
//
// (It might seem like a nicer solution to create the managed_json on the
// creation of the ManagedDirectory instance but it would actually
// prevent the use of read-only directories..)
let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1;
if managed_file_definitely_already_exists {
return Ok(());
}
self.directory.sync_directory()?;
Ok(()) Ok(())
} }
@@ -324,11 +310,6 @@ impl Directory for ManagedDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.directory.watch(watch_callback) self.directory.watch(watch_callback)
} }
fn sync_directory(&self) -> io::Result<()> {
self.directory.sync_directory()?;
Ok(())
}
} }
impl Clone for ManagedDirectory { impl Clone for ManagedDirectory {
@@ -359,7 +340,7 @@ mod tests_mmap_specific {
let test_path2: &'static Path = Path::new("some_path_for_test_2"); let test_path2: &'static Path = Path::new("some_path_for_test_2");
{ {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let write_file = managed_directory.open_write(test_path1).unwrap(); let write_file = managed_directory.open_write(test_path1).unwrap();
write_file.terminate().unwrap(); write_file.terminate().unwrap();
managed_directory managed_directory
@@ -374,7 +355,7 @@ mod tests_mmap_specific {
} }
{ {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
assert!(managed_directory.exists(test_path1).unwrap()); assert!(managed_directory.exists(test_path1).unwrap());
assert!(!managed_directory.exists(test_path2).unwrap()); assert!(!managed_directory.exists(test_path2).unwrap());
let living_files: HashSet<PathBuf> = HashSet::new(); let living_files: HashSet<PathBuf> = HashSet::new();
@@ -393,7 +374,7 @@ mod tests_mmap_specific {
let living_files = HashSet::new(); let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap(); let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap(); write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap(); write.terminate().unwrap();
@@ -74,12 +74,20 @@ pub struct CacheInfo {
pub mmapped: Vec<PathBuf>, pub mmapped: Vec<PathBuf>,
} }
#[derive(Default)]
struct MmapCache { struct MmapCache {
counters: CacheCounters, counters: CacheCounters,
cache: HashMap<PathBuf, WeakArcBytes>, cache: HashMap<PathBuf, WeakArcBytes>,
} }
impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
}
}
}
impl MmapCache { impl MmapCache {
fn get_info(&self) -> CacheInfo { fn get_info(&self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect(); let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -193,19 +201,16 @@ impl MmapDirectory {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> { pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref(); let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() { if !directory_path.exists() {
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from( Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path, directory_path,
))); )))
} } else if !directory_path.is_dir() {
let canonical_path: PathBuf = directory_path.canonicalize().map_err(|io_err| { Err(OpenDirectoryError::NotADirectory(PathBuf::from(
OpenDirectoryError::wrap_io_error(io_err, PathBuf::from(directory_path))
})?;
if !canonical_path.is_dir() {
return Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path, directory_path,
))); )))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
} }
Ok(MmapDirectory::new(canonical_path, None))
} }
/// Joins a relative_path to the directory `root_path` /// Joins a relative_path to the directory `root_path`
@@ -214,6 +219,33 @@ impl MmapDirectory {
self.inner.root_path.join(relative_path) self.inner.root_path.join(relative_path)
} }
/// Sync the root directory.
/// In certain FS, this is required to persistently create
/// a file.
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_all()?;
Ok(())
}
/// Returns some statistical information /// Returns some statistical information
/// about the Mmap cache. /// about the Mmap cache.
/// ///
@@ -264,7 +296,8 @@ impl Write for SafeFileWriter {
} }
fn flush(&mut self) -> io::Result<()> { fn flush(&mut self) -> io::Result<()> {
Ok(()) self.0.flush()?;
self.0.sync_all()
} }
} }
@@ -276,9 +309,7 @@ impl Seek for SafeFileWriter {
impl TerminatingWrite for SafeFileWriter { impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.0.flush()?; self.flush()
self.0.sync_data()?;
Ok(())
} }
} }
@@ -308,7 +339,6 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?; let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
tempfile.write_all(content)?; tempfile.write_all(content)?;
tempfile.flush()?; tempfile.flush()?;
tempfile.as_file_mut().sync_data()?;
tempfile.into_temp_path().persist(path)?; tempfile.into_temp_path().persist(path)?;
Ok(()) Ok(())
} }
@@ -343,17 +373,22 @@ impl Directory for MmapDirectory {
/// removed before the file is deleted. /// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
fs::remove_file(&full_path).map_err(|e| { match fs::remove_file(&full_path) {
if e.kind() == io::ErrorKind::NotFound { Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
DeleteError::FileDoesNotExist(path.to_owned()) io_error: e,
} else { filepath: path.to_path_buf(),
DeleteError::IoError { }),
io_error: e, Err(e) => {
filepath: path.to_path_buf(), if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
})
} }
} }
})?; }
Ok(())
} }
fn exists(&self, path: &Path) -> Result<bool, OpenReadError> { fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
@@ -382,13 +417,10 @@ impl Directory for MmapDirectory {
file.flush() file.flush()
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?; .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;
// Note we actually do not sync the parent directory here. // Apparently, on some filesystems syncing the parent
// // directory is required.
// A newly created file, may, in some case, be created and even flushed to disk. self.sync_directory()
// and then lost... .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;
//
// The file will only be durably written after we terminate AND
// sync_directory() is called.
let writer = SafeFileWriter::new(file); let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer))) Ok(BufWriter::new(Box::new(writer)))
@@ -418,7 +450,7 @@ impl Directory for MmapDirectory {
debug!("Atomic Write {:?}", path); debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
atomic_write(&full_path, content)?; atomic_write(&full_path, content)?;
Ok(()) self.sync_directory()
} }
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> { fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
@@ -444,30 +476,6 @@ impl Directory for MmapDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.inner.watch(watch_callback)) Ok(self.inner.watch(watch_callback))
} }
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_data()?;
Ok(())
}
} }
#[cfg(test)] #[cfg(test)]
@@ -574,8 +582,8 @@ mod tests {
} }
#[test] #[test]
fn test_mmap_released() -> crate::Result<()> { fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir()?; let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let mut schema_builder: SchemaBuilder = Schema::builder(); let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -584,30 +592,31 @@ mod tests {
let index = let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap(); Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default(); let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3); log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));
for _num_commits in 0..10 { for _num_commits in 0..10 {
for _ in 0..10 { for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"))?; index_writer.add_document(doc!(text_field=>"abc"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
} }
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
for _ in 0..4 { for _ in 0..4 {
index_writer.add_document(doc!(text_field=>"abc"))?; index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit()?; index_writer.commit().unwrap();
reader.reload()?; reader.reload().unwrap();
} }
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads().unwrap();
reader.reload()?; reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len(); let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4); assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore = let num_components_except_deletes_and_tempstore =
@@ -618,6 +627,5 @@ mod tests {
); );
} }
assert!(mmap_directory.get_cache_info().mmapped.is_empty()); assert!(mmap_directory.get_cache_info().mmapped.is_empty());
Ok(())
} }
} }

View File

@@ -1,6 +1,6 @@
/*! /*!
WORM (Write Once Read Many) directory abstraction. WORM directory abstraction.
*/ */

View File

@@ -18,6 +18,13 @@ use super::FileHandle;
/// Writer associated with the `RamDirectory` /// Writer associated with the `RamDirectory`
/// ///
/// The Writer just writes a buffer. /// The Writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter { struct VecWriter {
path: PathBuf, path: PathBuf,
shared_directory: RamDirectory, shared_directory: RamDirectory,
@@ -39,7 +46,7 @@ impl VecWriter {
impl Drop for VecWriter { impl Drop for VecWriter {
fn drop(&mut self) { fn drop(&mut self) {
if !self.is_flushed { if !self.is_flushed {
warn!( panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.", "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
self.path self.path
) )
@@ -214,8 +221,14 @@ impl Directory for RamDirectory {
} }
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> { fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)));
let path_buf = PathBuf::from(path); let path_buf = PathBuf::from(path);
self.fs.write().unwrap().write(path_buf, data); self.fs.write().unwrap().write(path_buf, data);
if path == *META_FILEPATH { if path == *META_FILEPATH {
let _ = self.fs.write().unwrap().watch_router.broadcast(); let _ = self.fs.write().unwrap().watch_router.broadcast();
} }
@@ -225,10 +238,6 @@ impl Directory for RamDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.fs.write().unwrap().watch(watch_callback)) Ok(self.fs.write().unwrap().watch(watch_callback))
} }
fn sync_directory(&self) -> io::Result<()> {
Ok(())
}
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -118,6 +118,15 @@ mod ram_directory_tests {
} }
} }
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let test_path: &'static Path = Path::new("some_path_for_test");
let ram_directory = RamDirectory::create();
let mut write_file = ram_directory.open_write(test_path).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
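For contrast with the panic test above, here is a minimal sketch of the intended usage: flushing explicitly before the writer is dropped. It assumes the same RamDirectory API exercised by these tests; the path and payload are illustrative.

use std::io::Write;
use std::path::Path;

fn ram_directory_flush_before_drop() -> std::io::Result<()> {
    let test_path: &'static Path = Path::new("some_path_for_test");
    let ram_directory = RamDirectory::create();
    let mut write_file = ram_directory.open_write(test_path).unwrap();
    write_file.write_all(&[4])?;
    // Flushing clears the writer's dirty state, so dropping it afterwards does not panic.
    write_file.flush()?;
    Ok(())
}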
fn test_simple(directory: &dyn Directory) -> crate::Result<()> { fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test"); let test_path: &'static Path = Path::new("some_path_for_test");
let mut write_file = directory.open_write(test_path)?; let mut write_file = directory.open_write(test_path)?;

View File

@@ -1,8 +1,7 @@
use crate::space_usage::ByteCount; use crate::space_usage::ByteCount;
use crate::DocId; use crate::DocId;
use common::intersect_bitsets;
use common::BitSet; use common::BitSet;
use common::ReadOnlyBitSet; use common::ReadSerializedBitSet;
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use std::io; use std::io;
use std::io::Write; use std::io::Write;
@@ -21,19 +20,8 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
#[derive(Clone)] #[derive(Clone)]
pub struct AliveBitSet { pub struct AliveBitSet {
num_alive_docs: usize, num_alive_docs: usize,
bitset: ReadOnlyBitSet, bitset: ReadSerializedBitSet,
} num_bytes: ByteCount,
/// Intersects two AliveBitSets in a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
let bitset = intersect_bitsets(left.bitset(), right.bitset());
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
} }
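A minimal sketch of how this helper composes with `from_bitset` below; the doc ids are illustrative and the assertions only restate the intersection semantics described above.

use common::BitSet;

fn intersect_alive_bitsets_sketch() {
    // Two bitsets over the same max_value, as required by intersect_alive_bitsets.
    let mut left = BitSet::with_max_value(4);
    let mut right = BitSet::with_max_value(4);
    left.insert(1);
    left.insert(2);
    right.insert(2);
    right.insert(3);
    let intersection = intersect_alive_bitsets(
        AliveBitSet::from_bitset(&left),
        AliveBitSet::from_bitset(&right),
    );
    // Only doc 2 is alive in both inputs.
    assert!(intersection.is_alive(2));
    assert!(!intersection.is_alive(1));
    assert!(!intersection.is_alive(3));
}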
impl AliveBitSet { impl AliveBitSet {
@@ -50,15 +38,15 @@ impl AliveBitSet {
Self::open(alive_bitset_bytes) Self::open(alive_bitset_bytes)
} }
pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
let readonly_bitset = ReadOnlyBitSet::from(bitset);
AliveBitSet::from(readonly_bitset)
}
/// Opens an alive bitset given its file. /// Opens an alive bitset given its file.
pub fn open(bytes: OwnedBytes) -> AliveBitSet { pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes); let num_bytes = bytes.len();
AliveBitSet::from(bitset) let bitset = ReadSerializedBitSet::open(bytes);
AliveBitSet {
num_alive_docs: bitset.len(),
bitset,
num_bytes,
}
} }
/// Returns true iff the document is still "alive". In other words, if it has not been deleted. /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
@@ -73,7 +61,7 @@ impl AliveBitSet {
!self.is_alive(doc) !self.is_alive(doc)
} }
/// Iterate over the alive doc_ids. /// Iterate over the alive docids.
#[inline] #[inline]
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ { pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter() self.bitset.iter()
@@ -81,7 +69,7 @@ impl AliveBitSet {
/// Get underlying bitset /// Get underlying bitset
#[inline] #[inline]
pub fn bitset(&self) -> &ReadOnlyBitSet { pub fn bitset(&self) -> &ReadSerializedBitSet {
&self.bitset &self.bitset
} }
@@ -92,17 +80,7 @@ impl AliveBitSet {
/// Summarize total space usage of this bitset. /// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount { pub fn space_usage(&self) -> ByteCount {
self.bitset().num_bytes() self.num_bytes
}
}
impl From<ReadOnlyBitSet> for AliveBitSet {
fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
} }
} }

View File

@@ -18,11 +18,11 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?; index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(bytes_field=>vec![]))?; index_writer.add_document(doc!(bytes_field=>vec![]));
index_writer.add_document(doc!(bytes_field=>vec![255u8]))?; index_writer.add_document(doc!(bytes_field=>vec![255u8]));
index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?; index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?; index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
field => b"tantivy".as_ref(), field => b"tantivy".as_ref(),
field => b"lucene".as_ref() field => b"lucene".as_ref()
))?; ));
index_writer.commit()?; index_writer.commit()?;
Ok(index.reader()?.searcher()) Ok(index.reader()?.searcher())
} }

View File

@@ -84,18 +84,18 @@ impl FacetReader {
mod tests { mod tests {
use crate::Index; use crate::Index;
use crate::{ use crate::{
schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}, schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED},
DocAddress, Document, DocAddress, Document,
}; };
#[test] #[test]
fn test_facet_only_indexed() -> crate::Result<()> { fn test_facet_only_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -106,19 +106,42 @@ mod tests {
facet_reader.facet_ords(0u32, &mut facet_ords); facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]); assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::facet); let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None); assert_eq!(value, None);
Ok(()) Ok(())
} }
#[test]
fn test_facet_only_stored() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Some("/a/b".to_string()));
Ok(())
}
#[test] #[test]
fn test_facet_stored_and_indexed() -> crate::Result<()> { fn test_facet_stored_and_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", STORED); let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -129,20 +152,43 @@ mod tests {
facet_reader.facet_ords(0u32, &mut facet_ords); facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]); assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet); let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); assert_eq!(value, Some("/a/b".to_string()));
Ok(())
}
#[test]
fn test_facet_neither_stored_and_indexed() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None);
Ok(()) Ok(())
} }
#[test] #[test]
fn test_facet_not_populated_for_all_docs() -> crate::Result<()> { fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher
@@ -160,12 +206,12 @@ mod tests {
#[test] #[test]
fn test_facet_not_populated_for_any_docs() -> crate::Result<()> { fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?; index_writer.add_document(Document::default());
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher let facet_reader = searcher

View File

@@ -23,7 +23,6 @@ values stored.
Read access performance is comparable to that of an array lookup. Read access performance is comparable to that of an array lookup.
*/ */
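To make the array-lookup claim concrete, here is a small, hedged sketch of declaring a multi-valued fast field and reading it back, using only the in-crate helpers exercised by the tests in this module (the field name is illustrative, and the `?` on `add_document` matches the variant of that API that returns a `Result`):

use crate::schema::{Cardinality, IntOptions, Schema};
use crate::Index;

fn fast_field_lookup_sketch() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_u64_field(
        "multifield",
        IntOptions::default().set_fast(Cardinality::MultiValues),
    );
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(field => 1u64, field => 3u64))?;
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    let ff_reader = searcher.segment_reader(0).fast_fields().u64s(field)?;
    // Fetching the values of a document is a positional lookup, not a term dictionary walk.
    let mut vals = Vec::new();
    ff_reader.get_vals(0, &mut vals);
    assert_eq!(&vals, &[1u64, 3u64]);
    Ok(())
}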
pub use self::alive_bitset::intersect_alive_bitsets;
pub use self::alive_bitset::write_alive_bitset; pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet; pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
@@ -110,7 +109,7 @@ impl FastValue for u64 {
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> { fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type { match *field_type {
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(), FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues), FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues),
_ => None, _ => None,
} }
} }
@@ -497,18 +496,18 @@ mod tests {
} }
#[test] #[test]
fn test_merge_missing_date_fast_field() -> crate::Result<()> { fn test_merge_missing_date_fast_field() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST); let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?; index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let segment_ids: Vec<SegmentId> = reader let segment_ids: Vec<SegmentId> = reader
.searcher() .searcher()
.segment_readers() .segment_readers()
@@ -517,10 +516,10 @@ mod tests {
.collect(); .collect();
assert_eq!(segment_ids.len(), 2); assert_eq!(segment_ids.len(), 2);
let merge_future = index_writer.merge(&segment_ids[..]); let merge_future = index_writer.merge(&segment_ids[..]);
futures::executor::block_on(merge_future)?; let merge_res = futures::executor::block_on(merge_future);
reader.reload()?; assert!(merge_res.is_ok());
assert!(reader.reload().is_ok());
assert_eq!(reader.searcher().segment_readers().len(), 1); assert_eq!(reader.searcher().segment_readers().len(), 1);
Ok(())
} }
#[test] #[test]
@@ -529,7 +528,7 @@ mod tests {
} }
#[test] #[test]
fn test_datefastfield() -> crate::Result<()> { fn test_datefastfield() {
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST); let date_field = schema_builder.add_date_field("date", FAST);
@@ -539,22 +538,22 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!( index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()), date_field => crate::DateTime::from_u64(1i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()), multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(3i64.to_u64()) multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(4i64.to_u64()) date_field => crate::DateTime::from_u64(4i64.to_u64())
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()), multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(6i64.to_u64()) multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
))?; ));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
@@ -581,7 +580,6 @@ mod tests {
assert_eq!(dates[0].timestamp(), 5i64); assert_eq!(dates[0].timestamp(), 5i64);
assert_eq!(dates[1].timestamp(), 6i64); assert_eq!(dates[1].timestamp(), 6i64);
} }
Ok(())
} }
} }

View File

@@ -12,9 +12,9 @@ mod tests {
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::Cardinality; use crate::schema::Cardinality;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::FacetOptions;
use crate::schema::IntOptions; use crate::schema::IntOptions;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::INDEXED;
use crate::Document; use crate::Document;
use crate::Index; use crate::Index;
use crate::Term; use crate::Term;
@@ -23,10 +23,10 @@ mod tests {
use proptest::prop_oneof; use proptest::prop_oneof;
use proptest::proptest; use proptest::proptest;
use proptest::strategy::Strategy; use proptest::strategy::Strategy;
use test_log::test; use test_env_log::test;
#[test] #[test]
fn test_multivalued_u64() -> crate::Result<()> { fn test_multivalued_u64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field( let field = schema_builder.add_u64_field(
"multifield", "multifield",
@@ -34,17 +34,17 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64))?; index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64))?; index_writer.add_document(doc!(field=>4u64));
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?; index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().u64s(field)?; let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
{ {
multi_value_reader.get_vals(2, &mut vals); multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]); assert_eq!(&vals, &[4u64]);
@@ -57,55 +57,56 @@ mod tests {
multi_value_reader.get_vals(1, &mut vals); multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty()); assert!(vals.is_empty());
} }
Ok(())
} }
#[test] #[test]
fn test_multivalued_date() -> crate::Result<()> { fn test_multivalued_date() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field( let date_field = schema_builder.add_date_field(
"multi_date_field", "multi_date_field",
IntOptions::default() IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_indexed() .set_indexed()
.set_fieldnorm()
.set_stored(), .set_stored(),
); );
let time_i = let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored()); schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let first_time_stamp = chrono::Utc::now(); let first_time_stamp = chrono::Utc::now();
index_writer.add_document( index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64), doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
)?; );
index_writer.add_document(doc!(time_i=>0i64))?; index_writer.add_document(doc!(time_i=>0i64));
// add one second // add one second
index_writer.add_document( index_writer
doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64), .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
)?;
// add another second // add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2); let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?; index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
// add three seconds // add three seconds
index_writer.add_document( index_writer
doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64), .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
)?; assert!(index_writer.commit().is_ok());
index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 5); assert_eq!(reader.num_docs(), 5);
{ {
let parser = QueryParser::for_index(&index, vec![date_field]); let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?; let query = parser
let results = searcher.search(&query, &TopDocs::with_limit(5))?; .parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
for (_score, doc_address) in results { for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!( assert_eq!(
retrieved_doc retrieved_doc
.get_first(date_field) .get_first(date_field)
@@ -127,8 +128,12 @@ mod tests {
{ {
let parser = QueryParser::for_index(&index, vec![date_field]); let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?; let query = parser
let results = searcher.search(&query, &TopDocs::with_limit(5))?; .parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
@@ -160,8 +165,10 @@ mod tests {
(first_time_stamp + Duration::seconds(1)).to_rfc3339(), (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339() (first_time_stamp + Duration::seconds(3)).to_rfc3339()
); );
let query = parser.parse_query(&range_q)?; let query = parser.parse_query(&range_q).expect("could not parse query");
let results = searcher.search(&query, &TopDocs::with_limit(5))?; let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 2); assert_eq!(results.len(), 2);
for (i, doc_pair) in results.iter().enumerate() { for (i, doc_pair) in results.iter().enumerate() {
@@ -189,16 +196,16 @@ mod tests {
retrieved_doc retrieved_doc
.get_first(time_i) .get_first(time_i)
.expect("cannot find value") .expect("cannot find value")
.i64_value(), .i64_value()
Some(time_i_val) .expect("value not of i64 type"),
time_i_val
); );
} }
} }
Ok(())
} }
#[test] #[test]
fn test_multivalued_i64() -> crate::Result<()> { fn test_multivalued_i64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field( let field = schema_builder.add_i64_field(
"multifield", "multifield",
@@ -206,14 +213,14 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64))?; index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64))?; index_writer.add_document(doc!(field=> -4i64));
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?; index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap(); let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -225,10 +232,9 @@ mod tests {
assert!(vals.is_empty()); assert!(vals.is_empty());
multi_value_reader.get_vals(3, &mut vals); multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]); assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
Ok(())
} }
fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> { fn test_multivalued_no_panic(ops: &[IndexingOp]) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field( let field = schema_builder.add_u64_field(
"multifield", "multifield",
@@ -238,7 +244,7 @@ mod tests {
); );
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops { for &op in ops {
@@ -246,19 +252,19 @@ mod tests {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
match id % 3 { match id % 3 {
0 => { 0 => {
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
} }
1 => { 1 => {
let mut doc = Document::new(); let mut doc = Document::new();
for _ in 0..5001 { for _ in 0..5001 {
doc.add_u64(field, id as u64); doc.add_u64(field, id as u64);
} }
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
_ => { _ => {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_u64(field, id as u64); doc.add_u64(field, id as u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
}; };
} }
@@ -269,16 +275,18 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
IndexingOp::Merge => { IndexingOp::Merge => {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 { if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids))?; block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.segment_updater().wait_merging_thread()?; assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
} }
} }
} }
} }
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// Merging the segments // Merging the segments
{ {
@@ -290,7 +298,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
} }
Ok(())
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
@@ -313,7 +320,7 @@ mod tests {
proptest! { proptest! {
#[test] #[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) { fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_multivalued_no_panic(&ops[..]).is_ok()); test_multivalued_no_panic(&ops[..]);
} }
} }
@@ -328,22 +335,20 @@ mod tests {
Merge, Merge,
]; ];
assert!(test_multivalued_no_panic(&ops[..]).is_ok()); test_multivalued_no_panic(&ops[..]);
} }
#[test] #[test]
#[ignore] #[ignore]
fn test_many_facets() -> crate::Result<()> { fn test_many_facets() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_facet_field("facetfield", FacetOptions::default()); let field = schema_builder.add_facet_field("facetfield", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..100_000 { for i in 0..100_000 {
index_writer index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
} }
index_writer.commit()?; assert!(index_writer.commit().is_ok());
Ok(())
} }
} }

View File

@@ -91,25 +91,27 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests { mod tests {
use crate::core::Index; use crate::core::Index;
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema}; use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
#[test] #[test]
fn test_multifastfield_reader() -> crate::Result<()> { fn test_multifastfield_reader() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!( index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"), facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"), facet_field => Facet::from("/category/cat1"),
))?; ));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?; index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?; index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
index_writer.commit()?; index_writer.commit().expect("Commit failed");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field)?; let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet = Facet::root(); let mut facet = Facet::root();
{ {
@@ -143,11 +145,10 @@ mod tests {
facet_reader.facet_ords(2, &mut vals); facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]); assert_eq!(&vals[..], &[4]);
} }
Ok(())
} }
#[test] #[test]
fn test_multifastfield_reader_min_max() -> crate::Result<()> { fn test_multifastfield_reader_min_max() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field_options = IntOptions::default() let field_options = IntOptions::default()
.set_indexed() .set_indexed()
@@ -162,16 +163,15 @@ mod tests {
item_field => 2i64, item_field => 2i64,
item_field => 3i64, item_field => 3i64,
item_field => -2i64, item_field => -2i64,
))?; ));
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?; index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
index_writer.add_document(doc!(item_field => 4i64))?; index_writer.add_document(doc!(item_field => 4i64));
index_writer.commit()?; index_writer.commit().expect("Commit failed");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field)?; let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
assert_eq!(field_reader.min_value(), -2); assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6); assert_eq!(field_reader.max_value(), 6);
Ok(())
} }
} }

View File

@@ -40,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
FieldType::Date(options) => options FieldType::Date(options) => options
.get_fastfield_cardinality() .get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)), .map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)), FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)),
_ => None, _ => None,
} }
} }

View File

@@ -54,7 +54,7 @@ impl FastFieldsWriter {
None => {} None => {}
} }
} }
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true); let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer); multi_values_writers.push(fast_field_writer);
} }

View File

@@ -26,137 +26,3 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter; pub use self::writer::FieldNormsWriter;
use self::code::{fieldnorm_to_id, id_to_fieldnorm}; use self::code::{fieldnorm_to_id, id_to_fieldnorm};
#[cfg(test)]
mod tests {
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::query::Query;
use crate::query::TermQuery;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions;
use crate::schema::TEXT;
use crate::Index;
use crate::Term;
use crate::TERMINATED;
use once_cell::sync::Lazy;
use std::path::Path;
use crate::schema::{Field, Schema, STORED};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("field", STORED);
schema_builder.add_text_field("txt_field", TEXT);
schema_builder.add_text_field(
"str_field",
TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
),
);
schema_builder.build()
});
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());
#[test]
#[should_panic(expected = "Cannot register a given fieldnorm twice")]
pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
}
#[test]
pub fn test_fieldnorm() -> crate::Result<()> {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let serializer = FieldNormsSerializer::from_write(write)?;
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
}
let file = directory.open_read(&path)?;
{
let fields_composite = CompositeFile::open(&file)?;
assert!(fields_composite.open_read(*FIELD).is_none());
assert!(fields_composite.open_read(*STR_FIELD).is_none());
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
let fieldnorm_reader = FieldNormReader::open(data)?;
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
}
Ok(())
}
#[test]
fn test_fieldnorm_disabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_fieldnorm_enabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(true));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.15136132).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}
}

View File

@@ -4,7 +4,6 @@ use super::fieldnorm_to_id;
use super::FieldNormsSerializer; use super::FieldNormsSerializer;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::Schema; use crate::schema::Schema;
use std::cmp::Ordering;
use std::{io, iter}; use std::{io, iter};
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte /// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
@@ -13,7 +12,8 @@ use std::{io, iter};
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a /// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field. /// byte per document per field.
pub struct FieldNormsWriter { pub struct FieldNormsWriter {
fieldnorms_buffers: Vec<Option<Vec<u8>>>, fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>,
} }
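A short sketch of the recording contract spelled out below: fieldnorms are recorded in increasing doc order, skipped documents implicitly get a fieldnorm of 0, and recording the same document twice panics. The schema and values are illustrative and mirror the fieldnorm tests elsewhere in this crate.

use crate::fieldnorm::FieldNormsWriter;
use crate::schema::{Schema, TEXT};

fn fieldnorm_record_sketch() {
    let mut schema_builder = Schema::builder();
    let txt_field = schema_builder.add_text_field("txt_field", TEXT);
    let schema = schema_builder.build();
    let mut fieldnorm_writers = FieldNormsWriter::for_schema(&schema);
    // Docs 0 and 1 are never recorded: they end up with a fieldnorm of 0.
    fieldnorm_writers.record(2u32, txt_field, 5);
    fieldnorm_writers.record(3u32, txt_field, 3);
    // Calling record(3u32, txt_field, ...) again would panic:
    // "Cannot register a given fieldnorm twice".
}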
impl FieldNormsWriter { impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema schema
.fields() .fields()
.filter_map(|(field, field_entry)| { .filter_map(|(field, field_entry)| {
if field_entry.is_indexed() && field_entry.has_fieldnorms() { if field_entry.is_indexed() {
Some(field) Some(field)
} else { } else {
None None
@@ -35,20 +35,25 @@ impl FieldNormsWriter {
/// Initialize with state for tracking the field norm fields /// Initialize with state for tracking the field norm fields
/// specified in the schema. /// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter { pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None) let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
.take(schema.num_fields()) let max_field = fields
.collect(); .iter()
for field in FieldNormsWriter::fields_with_fieldnorm(schema) { .map(Field::field_id)
fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000)); .max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
FieldNormsWriter {
fields,
fieldnorms_buffer: iter::repeat_with(Vec::new)
.take(max_field)
.collect::<Vec<_>>(),
} }
FieldNormsWriter { fieldnorms_buffers }
} }
/// The memory used, including children /// The memory used, including children
pub fn mem_usage(&self) -> usize { pub fn mem_usage(&self) -> usize {
self.fieldnorms_buffers self.fieldnorms_buffer
.iter() .iter()
.flatten()
.map(|buf| buf.capacity()) .map(|buf| buf.capacity())
.sum() .sum()
} }
@@ -57,10 +62,8 @@ impl FieldNormsWriter {
/// ///
/// Will extend with 0-bytes for documents that have not been seen. /// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) { pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() { for field in self.fields.iter() {
if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() { self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
fieldnorms_buffer.resize(max_doc as usize, 0u8);
}
} }
} }
@@ -73,23 +76,14 @@ impl FieldNormsWriter {
/// * field - the field being set /// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field` /// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) { pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
if let Some(fieldnorm_buffer) = self let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
.fieldnorms_buffers assert!(
.get_mut(field.field_id() as usize) fieldnorm_buffer.len() <= doc as usize,
.and_then(Option::as_mut) "Cannot register a given fieldnorm twice"
{ );
match fieldnorm_buffer.len().cmp(&(doc as usize)) { // we fill intermediary `DocId` as having a fieldnorm of 0.
Ordering::Less => { fieldnorm_buffer.resize(doc as usize + 1, 0u8);
// we fill intermediary `DocId` as having a fieldnorm of 0. fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
fieldnorm_buffer.resize(doc as usize, 0u8);
}
Ordering::Equal => {}
Ordering::Greater => {
panic!("Cannot register a given fieldnorm twice")
}
}
fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
}
} }
/// Serialize the seen fieldnorm values to the serializer for all fields. /// Serialize the seen fieldnorm values to the serializer for all fields.
@@ -98,18 +92,17 @@ impl FieldNormsWriter {
mut fieldnorms_serializer: FieldNormsSerializer, mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map( for &field in self.fields.iter() {
|(field_id, fieldnorms_buffer_opt)| { let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
(Field::from_field_id(field_id as u32), fieldnorms_buffer)
})
},
) {
if let Some(doc_id_map) = doc_id_map { if let Some(doc_id_map) = doc_id_map {
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer); let mut mapped_fieldnorm_values = vec![];
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?; mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
}
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
} else { } else {
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?; fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
} }
} }
fieldnorms_serializer.close()?; fieldnorms_serializer.close()?;

View File

@@ -49,7 +49,7 @@ fn test_functional_store() -> crate::Result<()> {
} }
for _ in 0..num_docs { for _ in 0..num_docs {
doc_set.push(doc_id); doc_set.push(doc_id);
index_writer.add_document(doc!(id_field=>doc_id))?; index_writer.add_document(doc!(id_field=>doc_id));
doc_id += 1; doc_id += 1;
} }
index_writer.commit()?; index_writer.commit()?;
@@ -124,7 +124,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);
} }
doc.add_text(text_field, get_text()); doc.add_text(text_field, get_text());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
} }
Ok(()) Ok(())
@@ -201,7 +201,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);
} }
doc.add_text(text_field, get_text()); doc.add_text(text_field, get_text());
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
} }
Ok(()) Ok(())

View File

@@ -1,324 +0,0 @@
use common::BitSet;
use itertools::Itertools;
use crate::fastfield::AliveBitSet;
use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
/// DemuxMapping can be used to reorganize data from multiple segments.
///
/// DemuxMapping is useful in a multi-tenant setting, in which each document might actually belong to a different tenant.
/// It allows reorganizing documents as follows:
///
/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
/// the documents (simplified)
/// Seg 1 [TENANT_A, TENANT_B]
/// Seg 2 [TENANT_A, TENANT_B]
///
/// You may want to group your documents to
/// Seg 1 [TENANT_A, TENANT_A]
/// Seg 2 [TENANT_B, TENANT_B]
///
/// Demuxing is the tool for that.
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
#[derive(Debug, Default)]
pub struct DemuxMapping {
/// [index old segment ordinal] -> [index doc_id] = new segment ordinal
mapping: Vec<DocIdToSegmentOrdinal>,
}
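Translating the two-tenant example above into the API defined in this file, a hedged sketch (the tenant-to-segment assignments are illustrative; `DocIdToSegmentOrdinal`, `demux` and the boxed `RamDirectory` outputs are all defined or used further down in this file):

use crate::directory::RamDirectory;

fn demux_two_tenants_sketch(
    segments: &[Segment],
    target_settings: IndexSettings,
) -> crate::Result<Vec<Index>> {
    let mut demux_mapping = DemuxMapping::default();
    // Old segment 0: doc 0 is TENANT_A (new segment 0), doc 1 is TENANT_B (new segment 1).
    let mut seg0 = DocIdToSegmentOrdinal::with_max_doc(2);
    seg0.set(0, 0);
    seg0.set(1, 1);
    demux_mapping.add(seg0);
    // Old segment 1: same tenant layout.
    let mut seg1 = DocIdToSegmentOrdinal::with_max_doc(2);
    seg1.set(0, 0);
    seg1.set(1, 1);
    demux_mapping.add(seg1);
    // One output directory per new segment ordinal.
    demux(
        segments,
        &demux_mapping,
        target_settings,
        vec![
            Box::new(RamDirectory::default()),
            Box::new(RamDirectory::default()),
        ],
    )
}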
/// DocIdToSegmentOrdinal maps from doc_id within a segment to the new segment ordinal for demuxing.
///
/// For every source segment there is a `DocIdToSegmentOrdinal` to distribute its doc_ids.
#[derive(Debug, Default)]
pub struct DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: Vec<SegmentOrdinal>,
}
impl DocIdToSegmentOrdinal {
/// Creates a new DocIdToSegmentOrdinal with a size of `max_doc` doc_ids.
/// Initially all doc_ids point to segment ordinal 0 and need to be set
/// via the `set` method.
pub fn with_max_doc(max_doc: usize) -> Self {
DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: vec![0; max_doc],
}
}
/// Returns the number of documents in this mapping.
/// It should be equal to the `max_doc` of the segment it targets.
pub fn max_doc(&self) -> u32 {
self.doc_id_index_to_segment_ord.len() as u32
}
/// Associates a doc_id with an output `SegmentOrdinal`.
pub fn set(&mut self, doc_id: u32, segment_ord: SegmentOrdinal) {
self.doc_id_index_to_segment_ord[doc_id as usize] = segment_ord;
}
/// Iterates over the new SegmentOrdinal in the order of the doc_id.
pub fn iter(&self) -> impl Iterator<Item = SegmentOrdinal> + '_ {
self.doc_id_index_to_segment_ord.iter().cloned()
}
}
impl DemuxMapping {
/// Adds a DocIdToSegmentOrdinal. The order of the push calls
/// defines the old segment ordinal, e.g. the first push corresponds to ordinal 0.
pub fn add(&mut self, segment_mapping: DocIdToSegmentOrdinal) {
self.mapping.push(segment_mapping);
}
/// Returns the old number of segments.
pub fn get_old_num_segments(&self) -> usize {
self.mapping.len()
}
}
fn docs_for_segment_ord(
doc_id_to_segment_ord: &DocIdToSegmentOrdinal,
target_segment_ord: SegmentOrdinal,
) -> AliveBitSet {
let mut bitset = BitSet::with_max_value(doc_id_to_segment_ord.max_doc());
for doc_id in doc_id_to_segment_ord
.iter()
.enumerate()
.filter(|(_doc_id, new_segment_ord)| *new_segment_ord == target_segment_ord)
.map(|(doc_id, _)| doc_id)
{
// add document if segment ordinal = target segment ordinal
bitset.insert(doc_id as u32);
}
AliveBitSet::from_bitset(&bitset)
}
fn get_alive_bitsets(
demux_mapping: &DemuxMapping,
target_segment_ord: SegmentOrdinal,
) -> Vec<AliveBitSet> {
demux_mapping
.mapping
.iter()
.map(|doc_id_to_segment_ord| {
docs_for_segment_ord(doc_id_to_segment_ord, target_segment_ord)
})
.collect_vec()
}
/// Demux the segments according to `demux_mapping`. See `DemuxMapping`.
/// The number of output_directories needs to match the max new segment ordinal from `demux_mapping`.
///
/// The ordinals of `segments` need to match the ordinals provided in `demux_mapping`.
pub fn demux(
segments: &[Segment],
demux_mapping: &DemuxMapping,
target_settings: IndexSettings,
output_directories: Vec<Box<dyn Directory>>,
) -> crate::Result<Vec<Index>> {
let mut indices = vec![];
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
.into_iter()
.map(Some)
.collect_vec();
let index = merge_filtered_segments(
segments,
target_settings.clone(),
delete_bitsets,
output_directory,
)?;
indices.push(index);
}
Ok(indices)
}
#[cfg(test)]
mod tests {
use crate::{
collector::TopDocs,
directory::RamDirectory,
query::QueryParser,
schema::{Schema, TEXT},
DocAddress, Term,
};
use super::*;
#[test]
fn test_demux_map_to_deletebitset() {
let max_value = 2;
let mut demux_mapping = DemuxMapping::default();
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
{
let bit_sets_for_demuxing_to_segment_ord_0 = get_alive_bitsets(&demux_mapping, 0);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(1),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(1),
true
);
}
{
let bit_sets_for_demuxing_to_segment_ord_1 = get_alive_bitsets(&demux_mapping, 1);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(1),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(1),
false
);
}
}
#[test]
fn test_demux_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto1"))?;
index_writer.add_document(doc!(text_field=>"texto2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto3"))?;
index_writer.add_document(doc!(text_field=>"texto4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let mut demux_mapping = DemuxMapping::default();
{
let max_value = 2;
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
}
assert_eq!(demux_mapping.get_old_num_segments(), 2);
let demuxed_indices = demux(
&segments,
&demux_mapping,
target_settings,
vec![
Box::new(RamDirectory::default()),
Box::new(RamDirectory::default()),
],
)?;
{
let index = &demuxed_indices[0];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![] as Vec<u32>);
assert_eq!(do_search("texto2"), vec![0]);
}
}
{
let index = &demuxed_indices[1];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 3);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![0]);
assert_eq!(do_search("texto2"), vec![] as Vec<u32>);
assert_eq!(do_search("texto3"), vec![1]);
assert_eq!(do_search("texto4"), vec![2]);
}
}
Ok(())
}
}

View File

@@ -11,12 +11,12 @@ use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to old doc_id and segment. /// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct SegmentDocIdMapping { pub(crate) struct SegmentDocidMapping {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>, new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool, is_trivial: bool,
} }
impl SegmentDocIdMapping { impl SegmentDocidMapping {
pub(crate) fn new( pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>, new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool, is_trivial: bool,
@@ -40,14 +40,14 @@ impl SegmentDocIdMapping {
self.is_trivial self.is_trivial
} }
} }
impl Index<usize> for SegmentDocIdMapping { impl Index<usize> for SegmentDocidMapping {
type Output = (DocId, SegmentOrdinal); type Output = (DocId, SegmentOrdinal);
fn index(&self, idx: usize) -> &Self::Output { fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx] &self.new_doc_id_to_old_and_segment[idx]
} }
} }
impl IntoIterator for SegmentDocIdMapping { impl IntoIterator for SegmentDocidMapping {
type Item = (DocId, SegmentOrdinal); type Item = (DocId, SegmentOrdinal);
type IntoIter = std::vec::IntoIter<Self::Item>; type IntoIter = std::vec::IntoIter<Self::Item>;
@@ -63,24 +63,6 @@ pub struct DocIdMapping {
} }
impl DocIdMapping { impl DocIdMapping {
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
.iter()
.cloned()
.max()
.map(|n| n + 1)
.unwrap_or(0);
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
}
}
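// For example, from_new_id_to_old_id(vec![3, 2, 5]) maps new docs 0, 1, 2 to old
// docs 3, 2, 5; the inverse table then sends old docs 3, 2 and 5 back to new
// docs 0, 1 and 2 (slots for unmapped old doc_ids default to 0).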
/// returns the new doc_id for the old doc_id /// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId { pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize] self.old_doc_id_to_new[doc_id as usize]
@@ -93,13 +75,6 @@ impl DocIdMapping {
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ { pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned() self.new_doc_id_to_old.iter().cloned()
} }
/// Remaps a given array to the new doc ids.
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
self.new_doc_id_to_old
.iter()
.map(|old_doc| els[*old_doc as usize])
.collect()
}
} }
pub(crate) fn expect_field_id_for_sort_field( pub(crate) fn expect_field_id_for_sort_field(
@@ -147,13 +122,23 @@ pub(crate) fn get_doc_id_mapping_from_field(
.into_iter() .into_iter()
.map(|el| el.0) .map(|el| el.0)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
// create old doc_id to new doc_id index (used in posting recorder)
let max_doc = new_doc_id_to_old.len();
let mut old_doc_id_to_new = vec![0; max_doc];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
let doc_id_map = DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
};
Ok(doc_id_map)
} }
#[cfg(test)] #[cfg(test)]
mod tests_indexsorting { mod tests_indexsorting {
use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldReader;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::{collector::TopDocs, query::QueryParser, schema::*}; use crate::{collector::TopDocs, query::QueryParser, schema::*};
use crate::{schema::Schema, DocAddress}; use crate::{schema::Schema, DocAddress};
use crate::{Index, IndexSettings, IndexSortByField, Order}; use crate::{Index, IndexSettings, IndexSortByField, Order};
@@ -161,7 +146,7 @@ mod tests_indexsorting {
fn create_test_index( fn create_test_index(
index_settings: Option<IndexSettings>, index_settings: Option<IndexSettings>,
text_field_options: TextOptions, text_field_options: TextOptions,
) -> crate::Result<Index> { ) -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let my_text_field = schema_builder.add_text_field("text_field", text_field_options); let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
@@ -181,20 +166,19 @@ mod tests_indexsorting {
if let Some(settings) = index_settings { if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings); index_builder = index_builder.settings(settings);
} }
let index = index_builder.create_in_ram()?; let index = index_builder.create_in_ram().unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(my_number=>40_u64))?; index_writer.add_document(doc!(my_number=>40_u64));
index_writer.add_document( index_writer
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64), .add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
)?; index_writer.add_document(doc!(my_number=>100_u64));
index_writer.add_document(doc!(my_number=>100_u64))?;
index_writer.add_document( index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"), doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
)?; );
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?; index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
index_writer.commit()?; index_writer.commit().unwrap();
Ok(index) index
} }
fn get_text_options() -> TextOptions { fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options( TextOptions::default().set_indexing_options(
@@ -219,7 +203,7 @@ mod tests_indexsorting {
for option in options { for option in options {
//let options = get_text_options(); //let options = get_text_options();
// no index_sort // no index_sort
let index = create_test_index(None, option.clone())?; let index = create_test_index(None, option.clone());
let my_text_field = index.schema().get_field("text_field").unwrap(); let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -241,7 +225,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
option.clone(), option.clone(),
)?; );
let my_text_field = index.schema().get_field("text_field").unwrap(); let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -273,7 +257,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
option.clone(), option.clone(),
)?; );
let my_string_field = index.schema().get_field("text_field").unwrap(); let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -303,7 +287,7 @@ mod tests_indexsorting {
#[test] #[test]
fn test_sort_index_get_documents() -> crate::Result<()> { fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline // default baseline
let index = create_test_index(None, get_text_options())?; let index = create_test_index(None, get_text_options());
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -332,7 +316,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -357,7 +341,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
@@ -372,7 +356,7 @@ mod tests_indexsorting {
#[test] #[test]
fn test_sort_index_test_string_field() -> crate::Result<()> { fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options())?; let index = create_test_index(None, get_text_options());
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -392,7 +376,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -423,7 +407,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -459,7 +443,7 @@ mod tests_indexsorting {
..Default::default() ..Default::default()
}), }),
get_text_options(), get_text_options(),
)?; );
assert_eq!( assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field, index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string() "my_number".to_string()
@@ -490,27 +474,4 @@ mod tests_indexsorting {
assert_eq!(vals, &[3]); assert_eq!(vals, &[3]);
Ok(()) Ok(())
} }
#[test]
fn test_doc_mapping() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
}
#[test]
fn test_doc_mapping_remap() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
assert_eq!(
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
&[2000, 8000, 3000]
);
}
} }

View File

@@ -14,7 +14,6 @@ use crate::error::TantivyError;
use crate::fastfield::write_alive_bitset; use crate::fastfield::write_alive_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue}; use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::index_writer_status::IndexWriterStatus;
use crate::indexer::operation::DeleteOperation; use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper; use crate::indexer::stamper::Stamper;
use crate::indexer::MergePolicy; use crate::indexer::MergePolicy;
@@ -29,13 +28,16 @@ use crossbeam::channel;
use futures::executor::block_on; use futures::executor::block_on;
use futures::future::Future; use futures::future::Future;
use smallvec::smallvec; use smallvec::smallvec;
use smallvec::SmallVec;
use wasm_mt_pool::pool_exec;
use wasm_mt::prelude::*;
use std::mem;
use std::ops::Range; use std::ops::Range;
use std::sync::Arc; use std::sync::Arc;
use wasm_mt_pool::prelude::*;
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
use super::{AddBatch, AddBatchReceiver, AddBatchSender};
// Size of the margin for the heap. A segment is closed when the remaining memory // Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES. // in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000; pub const MARGIN_IN_BYTES: usize = 1_000_000;
@@ -51,12 +53,15 @@ pub const MAX_NUM_THREAD: usize = 8;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS` // reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
fn error_in_index_worker_thread(context: &str) -> TantivyError { // Group of operations.
TantivyError::ErrorInThread(format!( // Most of the time, users will send operations one by one, but it can be useful to
"{}. A worker thread encountered an error (io::Error most likely) or panicked.", // send them as a small block to ensure that
context // - all docs in the group end up in the same segment, with contiguous doc_ids.
)) // - all operations in the group are committed at the same time, making the group
} // atomic.
type OperationGroup = SmallVec<[AddOperation; 4]>;
type OperationSender = channel::Sender<OperationGroup>;
type OperationReceiver = channel::Receiver<OperationGroup>;
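// A hedged illustration of why grouping matters (not part of this file; the
// field and term below are made up): a caller can submit several user
// operations through `IndexWriter::run` so that they land in one segment with
// contiguous doc_ids and become visible at the same commit. Depending on the
// branch, `run` returns the opstamp directly or wrapped in a `crate::Result`.
//
//     let ops = vec![
//         UserOperation::Add(doc!(text_field => "a")),
//         UserOperation::Delete(Term::from_field_text(text_field, "b")),
//     ];
//     let batch_opstamp = index_writer.run(ops);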
/// `IndexWriter` is the user entry-point to add document to an index. /// `IndexWriter` is the user entry-point to add document to an index.
/// ///
@@ -73,10 +78,10 @@ pub struct IndexWriter {
heap_size_in_bytes_per_thread: usize, heap_size_in_bytes_per_thread: usize,
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>, workers_join_handle: Vec<JoinHandle<Result<JsValue, JsValue>>>,
index_writer_status: IndexWriterStatus, operation_receiver: OperationReceiver,
operation_sender: AddBatchSender, operation_sender: OperationSender,
segment_updater: SegmentUpdater, segment_updater: SegmentUpdater,
@@ -88,6 +93,8 @@ pub struct IndexWriter {
stamper: Stamper, stamper: Stamper,
committed_opstamp: Opstamp, committed_opstamp: Opstamp,
worker_pool: wasm_mt_pool::ThreadPool,
} }
fn compute_deleted_bitset( fn compute_deleted_bitset(
@@ -162,8 +169,15 @@ pub(crate) fn advance_deletes(
target_opstamp, target_opstamp,
)?; )?;
// TODO optimize
// It should be possible to do something smarter by manipulation bitsets directly
// to compute this union.
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() { if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
alive_bitset.intersect_update(seg_alive_bitset.bitset()); for doc in 0u32..max_doc {
if seg_alive_bitset.is_deleted(doc) {
alive_bitset.remove(doc);
}
}
} }
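// Both variants compute the same thing: the freshly built alive bitset keeps
// only the docs that were already alive in the segment's existing bitset.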
let num_alive_docs: u32 = alive_bitset.len() as u32; let num_alive_docs: u32 = alive_bitset.len() as u32;
@@ -183,10 +197,10 @@ pub(crate) fn advance_deletes(
fn index_documents( fn index_documents(
memory_budget: usize, memory_budget: usize,
segment: Segment, segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>, grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater, segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> crate::Result<()> { ) -> crate::Result<bool> {
let schema = segment.schema(); let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?; let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
@@ -205,7 +219,7 @@ fn index_documents(
} }
if !segment_updater.is_alive() { if !segment_updater.is_alive() {
return Ok(()); return Ok(false);
} }
let max_doc = segment_writer.max_doc(); let max_doc = segment_writer.max_doc();
@@ -225,13 +239,13 @@ fn index_documents(
// update segment_updater inventory to remove tempstore // update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt); let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
block_on(segment_updater.schedule_add_segment(segment_entry))?; block_on(segment_updater.schedule_add_segment(segment_entry))?;
Ok(()) Ok(true)
} }
/// `doc_opstamps` is required to be non-empty. /// `doc_opstamps` is required to be non-empty.
fn apply_deletes( fn apply_deletes(
segment: &Segment, segment: &Segment,
delete_cursor: &mut DeleteCursor, mut delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp], doc_opstamps: &[Opstamp],
) -> crate::Result<Option<BitSet>> { ) -> crate::Result<Option<BitSet>> {
if delete_cursor.get().is_none() { if delete_cursor.get().is_none() {
@@ -254,7 +268,7 @@ fn apply_deletes(
let may_have_deletes = compute_deleted_bitset( let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset, &mut deleted_bitset,
&segment_reader, &segment_reader,
delete_cursor, &mut delete_cursor,
&doc_to_opstamps, &doc_to_opstamps,
max_doc_opstamp, max_doc_opstamp,
)?; )?;
@@ -278,7 +292,8 @@ impl IndexWriter {
/// should work at the same time. /// should work at the same time.
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument` /// # Panics
/// If the heap size per thread is too small, panics.
pub(crate) fn new( pub(crate) fn new(
index: &Index, index: &Index,
num_threads: usize, num_threads: usize,
@@ -296,7 +311,7 @@ impl IndexWriter {
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX); let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg)); return Err(TantivyError::InvalidArgument(err_msg));
} }
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) = let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new(); let delete_queue = DeleteQueue::new();
@@ -308,13 +323,14 @@ impl IndexWriter {
let segment_updater = let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?; SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let worker_pool = block_on(wasm_mt_pool::ThreadPool::new(num_threads, crate::PKG_JS).and_init()).unwrap();
let mut index_writer = IndexWriter { let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock), _directory_lock: Some(directory_lock),
heap_size_in_bytes_per_thread, heap_size_in_bytes_per_thread,
index: index.clone(), index: index.clone(),
index_writer_status: IndexWriterStatus::from(document_receiver), operation_receiver: document_receiver,
operation_sender: document_sender, operation_sender: document_sender,
segment_updater, segment_updater,
@@ -328,6 +344,7 @@ impl IndexWriter {
stamper, stamper,
worker_id: 0, worker_id: 0,
worker_pool,
}; };
index_writer.start_workers()?; index_writer.start_workers()?;
Ok(index_writer) Ok(index_writer)
@@ -354,14 +371,16 @@ impl IndexWriter {
for join_handle in former_workers_handles { for join_handle in former_workers_handles {
join_handle join_handle
.join() .join()
.map_err(|_| error_in_index_worker_thread("Worker thread panicked."))? .expect("Indexing Worker thread panicked")
.map_err(|_| error_in_index_worker_thread("Worker thread failed."))?; .map_err(|_| {
TantivyError::ErrorInThread("Error in indexing worker thread.".into())
})?;
} }
let result = self let result = self
.segment_updater .segment_updater
.wait_merging_thread() .wait_merging_thread()
.map_err(|_| error_in_index_worker_thread("Failed to join merging thread.")); .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
if let Err(ref e) = result { if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e); error!("Some merging thread failed {:?}", e);
@@ -389,53 +408,45 @@ impl IndexWriter {
self.index.new_segment() self.index.new_segment()
} }
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encountered an IO error, for instance.".to_string()))
}
/// Spawns a new worker thread for indexing. /// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline. /// The thread consumes documents from the pipeline.
fn add_indexing_worker(&mut self) -> crate::Result<()> { fn add_indexing_worker(&mut self) -> crate::Result<()> {
let document_receiver_clone = self.operation_receiver()?; let document_receiver_clone = self.operation_receiver.clone();
let index_writer_bomb = self.index_writer_status.create_bomb();
let mut segment_updater = self.segment_updater.clone(); let mut segment_updater = self.segment_updater.clone();
let mut delete_cursor = self.delete_queue.cursor(); let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread; let mem_budget = self.heap_size_in_bytes_per_thread;
let index = self.index.clone(); let index = self.index.clone();
let join_handle: JoinHandle<crate::Result<()>> = thread::Builder::new() let join_handle: JoinHandle<crate::Result<_>> = pool_exec!(self.worker_pool,
.name(format!("thrd-tantivy-index{}", self.worker_id)) move || {
.spawn(move || {
loop { loop {
let mut document_iterator = document_receiver_clone let mut document_iterator =
.clone() document_receiver_clone.clone().into_iter().peekable();
.into_iter()
.filter(|batch| !batch.is_empty())
.peekable();
// The peeking here is to avoid creating a new segment's files // the peeking here is to avoid
// creating a new segment's files
// if no document are available. // if no document are available.
// //
// This is a valid guarantee as the peeked document now belongs to // this is a valid guarantee as the
// peeked document now belongs to
// our local iterator. // our local iterator.
if let Some(batch) = document_iterator.peek() { if let Some(operations) = document_iterator.peek() {
assert!(!batch.is_empty()); if let Some(first) = operations.first() {
delete_cursor.skip_to(batch[0].opstamp); delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
} else { } else {
// No more documents. // No more documents.
// It happens when there is a commit, or if the `IndexWriter` // Happens when there is a commit, or if the `IndexWriter`
// was dropped. // was dropped.
index_writer_bomb.defuse();
return Ok(()); return Ok(());
} }
let segment = index.new_segment();
index_documents( index_documents(
mem_budget, mem_budget,
index.new_segment(), segment,
&mut document_iterator, &mut document_iterator,
&mut segment_updater, &mut segment_updater,
delete_cursor.clone(), delete_cursor.clone(),
@@ -465,8 +476,10 @@ impl IndexWriter {
} }
/// Detects and removes the files that are not used by the index anymore. /// Detects and removes the files that are not used by the index anymore.
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> { pub fn garbage_collect_files(
self.segment_updater.schedule_garbage_collect().await &self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
self.segment_updater.schedule_garbage_collect()
} }
/// Deletes all documents from the index /// Deletes all documents from the index
@@ -489,7 +502,7 @@ impl IndexWriter {
/// let index = Index::create_in_ram(schema.clone()); /// let index = Index::create_in_ram(schema.clone());
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Prometheus"))?; /// index_writer.add_document(doc!(title => "The modern Prometheus"));
/// index_writer.commit()?; /// index_writer.commit()?;
/// ///
/// let clear_res = index_writer.delete_all_documents().unwrap(); /// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -533,11 +546,12 @@ impl IndexWriter {
/// when no documents are remaining. /// when no documents are remaining.
/// ///
/// Returns the former segment_ready channel. /// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) { #[allow(unused_must_use)]
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) = fn recreate_document_channel(&mut self) -> OperationReceiver {
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender; mem::replace(&mut self.operation_sender, document_sender);
self.index_writer_status = IndexWriterStatus::from(document_receiver); mem::replace(&mut self.operation_receiver, document_receiver)
} }
/// Rollback to the last commit /// Rollback to the last commit
@@ -553,7 +567,7 @@ impl IndexWriter {
// marks the segment updater as killed. From now on, all // marks the segment updater as killed. From now on, all
// segment updates will be ignored. // segment updates will be ignored.
self.segment_updater.kill(); self.segment_updater.kill();
let document_receiver_res = self.operation_receiver(); let document_receiver = self.operation_receiver.clone();
// take the directory lock to create a new index_writer. // take the directory lock to create a new index_writer.
let directory_lock = self let directory_lock = self
@@ -579,9 +593,7 @@ impl IndexWriter {
// //
// This will reach an end as the only document_sender // This will reach an end as the only document_sender
// was dropped with the index_writer. // was dropped with the index_writer.
if let Ok(document_receiver) = document_receiver_res { for _ in document_receiver {}
for _ in document_receiver {}
}
Ok(self.committed_opstamp) Ok(self.committed_opstamp)
} }
@@ -695,10 +707,14 @@ impl IndexWriter {
/// The opstamp is an increasing `u64` that can /// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own /// be used by the client to align commits with its own
/// document queue. /// document queue.
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> { pub fn add_document(&self, document: Document) -> Opstamp {
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?; let add_operation = AddOperation { opstamp, document };
Ok(opstamp) let send_result = self.operation_sender.send(smallvec![add_operation]);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
} }
/// Gets a range of stamps from the stamper and "pops" the last stamp /// Gets a range of stamps from the stamper and "pops" the last stamp
@@ -711,7 +727,11 @@ impl IndexWriter {
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) { fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
let Range { start, end } = self.stamper.stamps(count + 1u64); let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1; let last_opstamp = end - 1;
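// E.g. with count = 2 and the stamper currently at 5, stamps(3) yields 5..8:
// opstamps 5 and 6 go to the two operations and 7 becomes the batch opstamp.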
(last_opstamp, start..last_opstamp) let stamps = Range {
start,
end: last_opstamp,
};
(last_opstamp, stamps)
} }
/// Runs a group of document operations ensuring that the operations are /// Runs a group of document operations ensuring that the operations are
@@ -730,20 +750,16 @@ impl IndexWriter {
/// Like adds and deletes (see `IndexWriter.add_document` and /// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be /// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`. /// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp> pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
where let count = user_operations.len() as u64;
I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
let count = user_operations_it.len() as u64;
if count == 0 { if count == 0 {
return Ok(self.stamper.stamp()); return self.stamper.stamp();
} }
let (batch_opstamp, stamps) = self.get_batch_opstamps(count); let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
let mut adds = AddBatch::default(); let mut adds = OperationGroup::default();
for (user_op, opstamp) in user_operations_it.zip(stamps) {
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
match user_op { match user_op {
UserOperation::Delete(term) => { UserOperation::Delete(term) => {
let delete_operation = DeleteOperation { opstamp, term }; let delete_operation = DeleteOperation { opstamp, term };
@@ -755,16 +771,12 @@ impl IndexWriter {
} }
} }
} }
self.send_add_documents_batch(adds)?; let send_result = self.operation_sender.send(adds);
Ok(batch_opstamp) if let Err(e) = send_result {
} panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
};
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> { batch_opstamp
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
Err(error_in_index_worker_thread("An index writer was killed."))
}
} }
} }
@@ -798,7 +810,6 @@ mod tests {
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::Cardinality; use crate::schema::Cardinality;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::FacetOptions;
use crate::schema::IntOptions; use crate::schema::IntOptions;
use crate::schema::TextFieldIndexing; use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions; use crate::schema::TextOptions;
@@ -831,7 +842,7 @@ mod tests {
UserOperation::Add(doc!(text_field=>"a")), UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")), UserOperation::Add(doc!(text_field=>"b")),
]; ];
let batch_opstamp1 = index_writer.run(operations).unwrap(); let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64); assert_eq!(batch_opstamp1, 2u64);
} }
@@ -842,12 +853,8 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer.add_document(doc!(text_field => "hello1"));
.add_document(doc!(text_field => "hello1")) index_writer.add_document(doc!(text_field => "hello2"));
.unwrap();
index_writer
.add_document(doc!(text_field => "hello2"))
.unwrap();
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
@@ -904,7 +911,7 @@ mod tests {
UserOperation::Delete(b_term), UserOperation::Delete(b_term),
]; ];
index_writer.run(operations).unwrap(); index_writer.run(operations);
index_writer.commit().expect("failed to commit"); index_writer.commit().expect("failed to commit");
reader.reload().expect("failed to load searchers"); reader.reload().expect("failed to load searchers");
@@ -934,10 +941,10 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap(); let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![]; let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap(); let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64); assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![]; let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2).unwrap(); let batch_opstamp2 = index_writer.run(operations2);
assert_eq!(batch_opstamp2, 1u64); assert_eq!(batch_opstamp2, 1u64);
} }
@@ -974,7 +981,7 @@ mod tests {
assert_eq!( assert_eq!(
format!("{:?}", index_writer.get_merge_policy()), format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \ "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }" level_log_size: 0.75 }"
); );
let merge_policy = Box::new(NoMergePolicy::default()); let merge_policy = Box::new(NoMergePolicy::default());
index_writer.set_merge_policy(merge_policy); index_writer.set_merge_policy(merge_policy);
@@ -997,14 +1004,15 @@ mod tests {
} }
#[test] #[test]
fn test_commit_and_rollback() -> crate::Result<()> { fn test_commit_and_rollback() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s); let term = Term::from_field_text(text_field, s);
@@ -1013,127 +1021,136 @@ mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer(3_000_000)?; let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback()?; index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64); assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
index_writer.add_document(doc!(text_field=>"b"))?; {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"b"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"c"));
reader.reload()?; }
assert!(index_writer.commit().is_ok());
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1); assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1); assert_eq!(num_docs_containing("c"), 1);
} }
reader.reload()?; reader.reload().unwrap();
reader.searcher(); reader.searcher();
Ok(())
} }
#[test] #[test]
fn test_with_merges() -> crate::Result<()> { fn test_with_merges() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a).unwrap() reader.searcher().doc_freq(&term_a).unwrap()
}; };
// writing the segment {
let mut index_writer = index.writer(12_000_000).unwrap(); // writing the segment
// create 8 segments with 100 tiny docs let mut index_writer = index.writer(12_000_000).unwrap();
for _doc in 0..100 { // create 8 segments with 100 tiny docs
index_writer.add_document(doc!(text_field=>"a"))?; for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
} }
index_writer.commit()?;
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?;
}
// this should create 8 segments and trigger a merge.
index_writer.commit()?;
index_writer.wait_merging_threads()?;
reader.reload()?;
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments()?.len() < 8);
Ok(())
} }
#[test] #[test]
fn test_prepare_with_commit_message() -> crate::Result<()> { fn test_prepare_with_commit_message() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer(12_000_000)?;
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
{
let mut prepared_commit = index_writer.prepare_commit()?;
prepared_commit.set_payload("first commit");
prepared_commit.commit()?;
}
{
let metas = index.load_metas()?;
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
index_writer.commit()?;
{
let metas = index.load_metas()?;
assert!(metas.payload.is_none());
}
Ok(())
}
#[test]
fn test_prepare_but_rollback() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?; let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs // create 8 segments with 100 tiny docs
for _doc in 0..100 { for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?; index_writer.add_document(doc!(text_field => "a"));
} }
{ {
let mut prepared_commit = index_writer.prepare_commit()?; let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
prepared_commit.abort()?; prepared_commit.commit().expect("commit failed");
} }
{ {
let metas = index.load_metas()?; let metas = index.load_metas().unwrap();
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();
{
let metas = index.load_metas().unwrap();
assert!(metas.payload.is_none());
}
}
}
#[test]
fn test_prepare_but_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
prepared_commit.abort().expect("commit failed");
}
{
let metas = index.load_metas().unwrap();
assert!(metas.payload.is_none()); assert!(metas.payload.is_none());
} }
for _doc in 0..100 { for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "b"))?; index_writer.add_document(doc!(text_field => "b"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
} }
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
index index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()? .try_into()
.unwrap()
.searcher() .searcher()
.doc_freq(&term_a) .doc_freq(&term_a)
.unwrap()
}; };
assert_eq!(num_docs_containing("a")?, 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b")?, 100); assert_eq!(num_docs_containing("b"), 100);
Ok(())
} }
#[test] #[test]
@@ -1154,7 +1171,7 @@ mod tests {
}; };
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap(); let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let commit_tstamp = index_writer.commit().unwrap(); let commit_tstamp = index_writer.commit().unwrap();
assert!(commit_tstamp > add_tstamp); assert!(commit_tstamp > add_tstamp);
index_writer.delete_all_documents().unwrap(); index_writer.delete_all_documents().unwrap();
@@ -1171,7 +1188,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap(); let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
// commit documents - they are now available // commit documents - they are now available
let first_commit = index_writer.commit(); let first_commit = index_writer.commit();
@@ -1190,7 +1207,7 @@ mod tests {
// add new documents again // add new documents again
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b")).unwrap(); index_writer.add_document(doc!(text_field => "b"));
} }
// rollback to last commit, when index was empty // rollback to last commit, when index was empty
@@ -1224,7 +1241,7 @@ mod tests {
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
// add one simple doc // add one simple doc
index_writer.add_document(doc!(text_field => "a")).unwrap(); index_writer.add_document(doc!(text_field => "a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
@@ -1248,7 +1265,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc // add one simple doc
assert!(index_writer.add_document(doc!(text_field => "a")).is_ok()); index_writer.add_document(doc!(text_field => "a"));
let comm = index_writer.commit(); let comm = index_writer.commit();
assert!(comm.is_ok()); assert!(comm.is_ok());
let commit_tstamp = comm.unwrap(); let commit_tstamp = comm.unwrap();
@@ -1324,13 +1341,13 @@ mod tests {
// create and delete docs in same commit // create and delete docs in same commit
for id in 0u64..5u64 { for id in 0u64..5u64 {
index_writer.add_document(doc!(id_field => id))?; index_writer.add_document(doc!(id_field => id));
} }
for id in 2u64..4u64 { for id in 2u64..4u64 {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
} }
for id in 5u64..10u64 { for id in 5u64..10u64 {
index_writer.add_document(doc!(id_field => id))?; index_writer.add_document(doc!(id_field => id));
} }
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
@@ -1358,24 +1375,15 @@ mod tests {
Merge, Merge,
} }
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> { fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![ prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }), (0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }), (0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit), (0u64..2u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge), (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
} }
fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) { fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new(); let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new(); let mut deleted_ids = HashSet::new();
@@ -1420,7 +1428,7 @@ mod tests {
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_stored(), .set_stored(),
); );
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let settings = if sort_index { let settings = if sort_index {
IndexSettings { IndexSettings {
@@ -1442,14 +1450,12 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
let old_reader = index.reader()?;
for &op in ops { for &op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
index_writer index_writer
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM))?; .add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1482,21 +1488,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
} }
old_reader.reload()?;
let old_searcher = old_reader.searcher();
let ids_old_searcher: HashSet<u64> = old_searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
})
.collect();
let ids: HashSet<u64> = searcher let ids: HashSet<u64> = searcher
.segment_readers() .segment_readers()
.iter() .iter()
@@ -1509,19 +1500,6 @@ mod tests {
.collect(); .collect();
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops); let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
let num_docs_expected = expected_ids_and_num_occurences
.iter()
.map(|(_, id_occurences)| *id_occurences as usize)
.sum::<usize>();
assert_eq!(searcher.num_docs() as usize, num_docs_expected);
assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
assert_eq!(
ids_old_searcher,
expected_ids_and_num_occurences
.keys()
.cloned()
.collect::<HashSet<_>>()
);
assert_eq!( assert_eq!(
ids, ids,
expected_ids_and_num_occurences expected_ids_and_num_occurences
@@ -1616,42 +1594,22 @@ mod tests {
} }
proptest! { proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test] #[test]
fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok()); assert!(test_operation_strategy(&ops[..], true, false).is_ok());
} }
#[test] #[test]
fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok()); assert!(test_operation_strategy(&ops[..], false, false).is_ok());
} }
#[test] #[test]
fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok()); assert!(test_operation_strategy(&ops[..], true, true).is_ok());
} }
#[test] #[test]
fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) { fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok()); assert!(test_operation_strategy(&ops[..], false, true).is_ok());
} }
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
} }
#[test] #[test]
@@ -1676,11 +1634,11 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// We add a doc... // We add a doc...
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?; index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
// And remove it. // And remove it.
index_writer.delete_term(Term::from_field_u64(id_field, 0u64)); index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
// We add another doc. // We add another doc.
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?; index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
// The expected result is a segment with // The expected result is a segment with
// maxdoc = 2 // maxdoc = 2
@@ -1697,14 +1655,14 @@ mod tests {
} }
#[test] #[test]
fn test_index_doc_missing_field() -> crate::Result<()> { fn test_index_doc_missing_field() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let idfield = schema_builder.add_text_field("id", STRING); let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING); schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"))?; index_writer.add_document(doc!(idfield=>"myid"));
index_writer.commit()?; let commit = index_writer.commit();
Ok(()) assert!(commit.is_ok());
} }
} }

View File

@@ -1,118 +0,0 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
}
impl IndexWriterStatus {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
}
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns None.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self
.inner
.receive_channel
.read()
.expect("This lock should never be poisoned");
rlock.as_ref().cloned()
}
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver>>,
}
impl Inner {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
fn kill(&self) {
self.is_alive.store(false, Ordering::Relaxed);
self.receive_channel
.write()
.expect("This lock should never be poisoned")
.take();
}
}
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
receive_channel: RwLock::new(Some(receiver)),
}),
}
}
}
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
}
impl IndexWriterBomb {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
/// the index writer.
pub fn defuse(mut self) {
self.inner = None;
}
}
impl Drop for IndexWriterBomb {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();
}
}
}
#[cfg(test)]
mod tests {
use super::IndexWriterStatus;
use crossbeam::channel;
use std::mem;
#[test]
fn test_bomb_goes_boom() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
assert!(index_writer_status.operation_receiver().is_some());
mem::drop(bomb);
// boom!
assert!(index_writer_status.operation_receiver().is_none());
}
#[test]
fn test_bomb_defused() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
bomb.defuse();
assert!(index_writer_status.operation_receiver().is_some());
}
}

View File

@@ -2,15 +2,12 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
use crate::core::SegmentMeta; use crate::core::SegmentMeta;
use itertools::Itertools; use itertools::Itertools;
use std::cmp; use std::cmp;
use std::f64;
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75; const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000; const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8; const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000; const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
// The default value of 1 means that deletes are not taken in account when
// identifying merge candidates. This is not a very sensible default: it was
// set like that for backward compatibility and might change in the near future.
const DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE: f32 = 1.0f32;
/// `LogMergePolicy` tries to merge segments that have a similar number of /// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents. /// documents.
@@ -20,7 +17,6 @@ pub struct LogMergePolicy {
max_docs_before_merge: usize, max_docs_before_merge: usize,
min_layer_size: u32, min_layer_size: u32,
level_log_size: f64, level_log_size: f64,
del_docs_ratio_before_merge: f32,
} }
impl LogMergePolicy { impl LogMergePolicy {
@@ -56,49 +52,19 @@ impl LogMergePolicy {
pub fn set_level_log_size(&mut self, level_log_size: f64) { pub fn set_level_log_size(&mut self, level_log_size: f64) {
self.level_log_size = level_log_size; self.level_log_size = level_log_size;
} }
/// Set the ratio of deleted documents in a segment to tolerate.
///
/// If it is exceeded by any segment at a log level, a merge
/// will be triggered for that level.
///
/// If there is a single segment at a level, we effectively end up expunging
/// deleted documents from it.
///
/// # Panics
///
/// Panics if del_docs_ratio_before_merge is not within (0..1].
pub fn set_del_docs_ratio_before_merge(&mut self, del_docs_ratio_before_merge: f32) {
assert!(del_docs_ratio_before_merge <= 1.0f32);
assert!(del_docs_ratio_before_merge > 0f32);
self.del_docs_ratio_before_merge = del_docs_ratio_before_merge;
}
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
level
.iter()
.any(|segment| deletes_ratio(segment) > self.del_docs_ratio_before_merge)
}
}
fn deletes_ratio(segment: &SegmentMeta) -> f32 {
if segment.max_doc() == 0 {
return 0f32;
}
segment.num_deleted_docs() as f32 / segment.max_doc() as f32
} }
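The `del_docs_ratio_before_merge` knob documented above (present only in the left-hand column) lets a whole level be merged as soon as any of its segments exceeds the tolerated share of deleted documents. A minimal configuration sketch, assuming the API variant that still provides the setter:

let mut policy = LogMergePolicy::default();
// Merge any level containing a segment with more than 25% deleted docs;
// the default of 1.0 effectively disables the check (panics outside (0, 1]).
policy.set_del_docs_ratio_before_merge(0.25f32);
index_writer.set_merge_policy(Box::new(policy));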
impl MergePolicy for LogMergePolicy { impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> { fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let size_sorted_segments = segments let mut size_sorted_segments = segments
.iter() .iter()
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32)) .filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
.collect::<Vec<&SegmentMeta>>(); .collect::<Vec<&SegmentMeta>>();
if size_sorted_segments.is_empty() { if size_sorted_segments.len() <= 1 {
return vec![]; return vec![];
} }
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
let mut current_max_log_size = f64::MAX; let mut current_max_log_size = f64::MAX;
let mut levels = vec![]; let mut levels = vec![];
@@ -116,10 +82,7 @@ impl MergePolicy for LogMergePolicy {
levels levels
.iter() .iter()
.filter(|level| { .filter(|level| level.len() >= self.min_num_segments)
level.len() >= self.min_num_segments
|| self.has_segment_above_deletes_threshold(level)
})
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect())) .map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
.collect() .collect()
} }
@@ -132,7 +95,6 @@ impl Default for LogMergePolicy {
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE, max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
min_layer_size: DEFAULT_MIN_LAYER_SIZE, min_layer_size: DEFAULT_MIN_LAYER_SIZE,
level_log_size: DEFAULT_LEVEL_LOG_SIZE, level_log_size: DEFAULT_LEVEL_LOG_SIZE,
del_docs_ratio_before_merge: DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE,
} }
} }
} }
@@ -152,7 +114,7 @@ mod tests {
use crate::Index; use crate::Index;
#[test] #[test]
fn create_index_test_max_merge_issue_1035() -> crate::Result<()> { fn create_index_test_max_merge_issue_1035() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intval", INDEXED); let int_field = schema_builder.add_u64_field("intval", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -165,34 +127,34 @@ mod tests {
log_merge_policy.set_max_docs_before_merge(1); log_merge_policy.set_max_docs_before_merge(1);
log_merge_policy.set_min_layer_size(0); log_merge_policy.set_min_layer_size(0);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));
// after every commit the merge checker is started, it will merge only segments with 1 // after every commit the merge checker is started, it will merge only segments with 1
// element in it because of the max_merge_size. // element in it because of the max_merge_size.
index_writer.add_document(doc!(int_field=>1_u64))?; index_writer.add_document(doc!(int_field=>1_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>2_u64))?; index_writer.add_document(doc!(int_field=>2_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>3_u64))?; index_writer.add_document(doc!(int_field=>3_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>4_u64))?; index_writer.add_document(doc!(int_field=>4_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64))?; index_writer.add_document(doc!(int_field=>5_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>6_u64))?; index_writer.add_document(doc!(int_field=>6_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>7_u64))?; index_writer.add_document(doc!(int_field=>7_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>8_u64))?; index_writer.add_document(doc!(int_field=>8_u64));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let _segment_ids = index let _segment_ids = index
@@ -207,7 +169,6 @@ mod tests {
panic!("segment can't have more than two segments"); panic!("segment can't have more than two segments");
} // don't know how to wait for the merge, then it could be a simple eq } // don't know how to wait for the merge, then it could be a simple eq
} }
Ok(())
} }
fn test_merge_policy() -> LogMergePolicy { fn test_merge_policy() -> LogMergePolicy {
@@ -326,49 +287,4 @@ mod tests {
assert_eq!(result_list[0].0[1], test_input[4].id()); assert_eq!(result_list[0].0[1], test_input[4].id());
assert_eq!(result_list[0].0[2], test_input[5].id()); assert_eq!(result_list[0].0[2], test_input[5].id());
} }
#[test]
fn test_merge_single_segment_with_deletes_below_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_000, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert!(merge_candidates.is_empty());
}
#[test]
fn test_merge_single_segment_with_deletes_above_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_001, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_all_in_level() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
create_random_segment_meta(40_000),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 2);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_different_level_not_involved() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(100),
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 1);
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
}
} }

View File

@@ -1,6 +1,6 @@
use crate::Opstamp; use crate::Opstamp;
use crate::SegmentId; use crate::SegmentId;
use crate::{Inventory, TrackedObject}; use census::{Inventory, TrackedObject};
use std::collections::HashSet; use std::collections::HashSet;
use std::ops::Deref; use std::ops::Deref;

View File

@@ -1,5 +1,4 @@
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::fastfield::CompositeFastFieldSerializer; use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DynamicFastFieldReader; use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldDataAccess; use crate::fastfield::FastFieldDataAccess;
@@ -10,7 +9,7 @@ use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter; use crate::fieldnorm::FieldNormsWriter;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping; use crate::indexer::doc_id_mapping::SegmentDocidMapping;
use crate::indexer::SegmentSerializer; use crate::indexer::SegmentSerializer;
use crate::postings::Postings; use crate::postings::Postings;
use crate::postings::{InvertedIndexSerializer, SegmentPostings}; use crate::postings::{InvertedIndexSerializer, SegmentPostings};
@@ -41,54 +40,31 @@ use tantivy_bitpacker::minmax;
/// We do not allow segments with more than /// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31; pub const MAX_DOC_LIMIT: u32 = 1 << 31;
fn estimate_total_num_tokens_in_single_segment( fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
reader: &SegmentReader, let mut total_tokens = 0u64;
field: Field, let mut count: [usize; 256] = [0; 256];
) -> crate::Result<u64> { for reader in readers {
// There are no deletes. We can simply use the exact value saved into the posting list. if reader.has_deletes() {
// Note that this value is not necessarily exact as it could have been the result of a merge between // if there are deletes, then we use an approximation
// segments themselves containing deletes. // using the fieldnorm
if !reader.has_deletes() { let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
return Ok(reader.inverted_index(field)?.total_num_tokens()); for doc in reader.doc_ids_alive() {
} let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
// When there are deletes, we use an approximation either }
// by using the fieldnorm. } else {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { total_tokens += reader.inverted_index(field)?.total_num_tokens();
let mut count: [usize; 256] = [0; 256];
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
} }
let total_num_tokens = count }
Ok(total_tokens
+ count
.iter() .iter()
.cloned() .cloned()
.enumerate() .enumerate()
.map(|(fieldnorm_ord, count)| { .map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}) })
.sum::<u64>(); .sum::<u64>())
return Ok(total_num_tokens);
}
// There are no fieldnorms available.
// Here we just do a pro-rata with the overall number of tokens and the ratio of
// documents alive.
let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
if reader.max_doc() == 0 {
// That supposedly never happens, but let's be a bit defensive here.
return Ok(0u64);
}
let ratio = reader.num_docs() as f64 / reader.max_doc() as f64;
Ok((segment_num_tokens as f64 * ratio) as u64)
}
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_num_tokens: u64 = 0;
for reader in readers {
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
}
Ok(total_num_tokens)
} }
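Both variants above fall back to estimation when a segment contains deletes. The left-hand variant adds a pro-rata fallback for the case where no fieldnorms are available: the stored token count is scaled by the fraction of documents still alive. A standalone sketch of that arithmetic; the function name is illustrative, not part of the diff:

// E.g. a segment that recorded 50_000 tokens with 900 of 1_000 docs alive
// yields an estimate of 45_000 remaining tokens.
fn prorata_token_estimate(segment_num_tokens: u64, num_docs: u32, max_doc: u32) -> u64 {
    if max_doc == 0 {
        return 0; // defensive: an empty segment contributes nothing
    }
    let alive_ratio = num_docs as f64 / max_doc as f64;
    (segment_num_tokens as f64 * alive_ratio) as u64
}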
pub struct IndexMerger { pub struct IndexMerger {
@@ -180,38 +156,16 @@ impl IndexMerger {
schema: Schema, schema: Schema,
index_settings: IndexSettings, index_settings: IndexSettings,
segments: &[Segment], segments: &[Segment],
) -> crate::Result<IndexMerger> {
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
}
// Create merge with a custom delete set.
// For every Segment, a delete bitset can be provided, which
// will be merged with the existing bit set. Make sure the index
// corresponds to the segment index.
//
// If `None` is provided for custom alive set, the regular alive set will be used.
// If a delete_bitsets is provided, the union between the provided and regular
// alive set will be used.
//
// This can be used to merge but also apply an additional filter.
// One use case is demux, which is basically taking a list of
// segments and partitions them e.g. by a value in a field.
pub fn open_with_custom_alive_set(
schema: Schema,
index_settings: IndexSettings,
segments: &[Segment],
alive_bitset_opt: Vec<Option<AliveBitSet>>,
) -> crate::Result<IndexMerger> { ) -> crate::Result<IndexMerger> {
let mut readers = vec![]; let mut readers = vec![];
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt.into_iter()) { let mut max_doc: u32 = 0u32;
for segment in segments {
if segment.meta().num_docs() > 0 { if segment.meta().num_docs() > 0 {
let reader = let reader = SegmentReader::open(segment)?;
SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?; max_doc += reader.num_docs();
readers.push(reader); readers.push(reader);
} }
} }
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() { if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?; readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
} }
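The comments above describe the demux use case for `open_with_custom_alive_set`: each segment may be paired with an extra filter bitset that is intersected with its regular alive set. A hedged call sketch, assuming the left-hand API; how each `AliveBitSet` is built is left abstract since it is not shown in the diff:

// One optional filter per segment; `None` means "keep the regular alive set".
let alive_bitsets: Vec<Option<AliveBitSet>> = segments.iter().map(|_| None).collect();
let merger = IndexMerger::open_with_custom_alive_set(
    schema.clone(),
    index_settings.clone(),
    &segments,
    alive_bitsets,
)?;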
@@ -259,7 +213,7 @@ impl IndexMerger {
fn write_fieldnorms( fn write_fieldnorms(
&self, &self,
mut fieldnorms_serializer: FieldNormsSerializer, mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema); let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize); let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -287,17 +241,17 @@ impl IndexMerger {
&self, &self,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>, mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
debug_time!("write_fast_fields"); debug_time!("write_fast_fields");
for (field, field_entry) in self.schema.fields() { for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type(); let field_type = field_entry.field_type();
match field_type { match field_type {
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
let term_ordinal_mapping = term_ord_mappings let term_ordinal_mapping = term_ord_mappings
.remove(&field) .remove(&field)
.expect("Logic Error in Tantivy (Please report). Facet field should have required a\ .expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
`term_ordinal_mapping`."); `term_ordinal_mapping`.");
self.write_hierarchical_facet_field( self.write_hierarchical_facet_field(
field, field,
@@ -338,15 +292,16 @@ impl IndexMerger {
&self, &self,
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
let (min_value, max_value) = self.readers.iter().filter_map(|reader|{ let (min_value, max_value) = self.readers.iter().map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_reader(field) .typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
compute_min_max_val(&u64_reader, reader) compute_min_max_val(&u64_reader, reader)
}) })
.flatten()
.reduce(|a, b| { .reduce(|a, b| {
(a.0.min(b.0), a.1.max(b.1)) (a.0.min(b.0), a.1.max(b.1))
}).expect("Unexpected error, empty readers in IndexMerger"); }).expect("Unexpected error, empty readers in IndexMerger");
@@ -369,17 +324,17 @@ impl IndexMerger {
num_vals: doc_id_mapping.len() as u64, num_vals: doc_id_mapping.len() as u64,
}; };
#[derive(Clone)] #[derive(Clone)]
struct SortedDocIdFieldAccessProvider<'a> { struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocidMapping,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>, fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
} }
impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> { impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let (doc_id, reader_ordinal) = self.doc_id_mapping[doc as usize]; let (doc_id, reader_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_ordinal as usize].get(doc_id) self.fast_field_readers[reader_ordinal as usize].get(doc_id)
} }
} }
let fastfield_accessor = SortedDocIdFieldAccessProvider { let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping, doc_id_mapping,
fast_field_readers: &fast_field_readers, fast_field_readers: &fast_field_readers,
}; };
@@ -434,9 +389,9 @@ impl IndexMerger {
Ok(value_accessor) Ok(value_accessor)
} }
/// Collecting value_accessors into a vec to bind the lifetime. /// Collecting value_accessors into a vec to bind the lifetime.
pub(crate) fn get_reader_with_sort_field_accessor( pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
&self, &'a self,
sort_by_field: &IndexSortByField, sort_by_field: &'b IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> { ) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> {
let reader_ordinal_and_field_accessors = self let reader_ordinal_and_field_accessors = self
.readers .readers
@@ -461,7 +416,7 @@ impl IndexMerger {
pub(crate) fn generate_doc_id_mapping( pub(crate) fn generate_doc_id_mapping(
&self, &self,
sort_by_field: &IndexSortByField, sort_by_field: &IndexSortByField,
) -> crate::Result<SegmentDocIdMapping> { ) -> crate::Result<SegmentDocidMapping> {
let reader_ordinal_and_field_accessors = let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?; self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression // Loading the field accessor on demand causes a 15x regression
@@ -504,7 +459,7 @@ impl IndexMerger {
}) })
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)), .map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)),
); );
Ok(SegmentDocIdMapping::new(sorted_doc_ids, false)) Ok(SegmentDocidMapping::new(sorted_doc_ids, false))
} }
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and // Creating the index file to point into the data, generic over `BytesFastFieldReader` and
@@ -513,7 +468,7 @@ impl IndexMerger {
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>( fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
reader_and_field_accessors: &[(&SegmentReader, T)], reader_and_field_accessors: &[(&SegmentReader, T)],
) -> crate::Result<Vec<u64>> { ) -> crate::Result<Vec<u64>> {
let mut total_num_vals = 0u64; let mut total_num_vals = 0u64;
@@ -572,7 +527,7 @@ impl IndexMerger {
&self, &self,
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Vec<u64>> { ) -> crate::Result<Vec<u64>> {
let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{ let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields() let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
@@ -594,7 +549,7 @@ impl IndexMerger {
field: Field, field: Field,
term_ordinal_mappings: &TermOrdinalMapping, term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
debug_time!("write_hierarchical_facet_field"); debug_time!("write_hierarchical_facet_field");
@@ -643,7 +598,7 @@ impl IndexMerger {
/// Creates a mapping if the segments are stacked. This is helpful to merge code paths between index /// Creates a mapping if the segments are stacked. This is helpful to merge code paths between index
/// sorting and the others. /// sorting and the others.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> { pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocidMapping> {
let total_num_new_docs = self let total_num_new_docs = self
.readers .readers
.iter() .iter()
@@ -656,19 +611,20 @@ impl IndexMerger {
self.readers self.readers
.iter() .iter()
.enumerate() .enumerate()
.flat_map(|(reader_ordinal, reader)| { .map(|(reader_ordinal, reader)| {
reader reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal)) .map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
}), })
.flatten(),
); );
Ok(SegmentDocIdMapping::new(mapping, true)) Ok(SegmentDocidMapping::new(mapping, true))
} }
fn write_multi_fast_field( fn write_multi_fast_field(
&self, &self,
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
// Multifastfield consists of 2 fastfields. // Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing. // The first serves as an index into the second one and is strictly increasing.
@@ -724,16 +680,16 @@ impl IndexMerger {
min_value, min_value,
}; };
struct SortedDocIdMultiValueAccessProvider<'a> { struct SortedDocidMultiValueAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocidMapping,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>, fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>, offsets: Vec<u64>,
} }
impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> { impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 { fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id which will contain the position. // use the offsets index to find the doc_id which will contain the position.
// the offsets are strictly increasing so we can do a simple search on it. // the offsets are strictly increasing so we can do a simple search on it.
let new_doc_id = self let new_docid = self
.offsets .offsets
.iter() .iter()
.position(|&offset| offset > pos) .position(|&offset| offset > pos)
@@ -741,10 +697,10 @@ impl IndexMerger {
- 1; - 1;
// now we need to find the position of `pos` in the multivalued bucket // now we need to find the position of `pos` in the multivalued bucket
let num_pos_covered_until_now = self.offsets[new_doc_id]; let num_pos_covered_until_now = self.offsets[new_docid];
let pos_in_values = pos - num_pos_covered_until_now; let pos_in_values = pos - num_pos_covered_until_now;
let (old_doc_id, reader_ordinal) = self.doc_id_mapping[new_doc_id as usize]; let (old_doc_id, reader_ordinal) = self.doc_id_mapping[new_docid as usize];
let num_vals = self.fast_field_readers[reader_ordinal as usize].get_len(old_doc_id); let num_vals = self.fast_field_readers[reader_ordinal as usize].get_len(old_doc_id);
assert!(num_vals >= pos_in_values); assert!(num_vals >= pos_in_values);
let mut vals = vec![]; let mut vals = vec![];
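A tiny worked example of the offset search described above, mirroring the linear scan in `get_val` (the helper name and values are illustrative):

// offsets[i] is the first flat position belonging to document i.
fn locate(offsets: &[u64], pos: u64) -> (usize, u64) {
    // The first offset strictly greater than `pos` marks the next document,
    // so the owning document sits one slot earlier.
    let new_doc_id = offsets.iter().position(|&offset| offset > pos).unwrap() - 1;
    (new_doc_id, pos - offsets[new_doc_id])
}
// locate(&[0, 2, 5, 9], 4) == (1, 2): flat position 4 is the third value of doc 1.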
@@ -753,23 +709,29 @@ impl IndexMerger {
vals[pos_in_values as usize] vals[pos_in_values as usize]
} }
} }
let fastfield_accessor = SortedDocIdMultiValueAccessProvider { let fastfield_accessor = SortedDocidMultiValueAccessProvider {
doc_id_mapping, doc_id_mapping,
fast_field_readers: &ff_readers, fast_field_readers: &ff_readers,
offsets, offsets,
}; };
let iter1 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| { let iter1 = doc_id_mapping
let ff_reader = &ff_readers[*reader_ordinal as usize]; .iter()
let mut vals = vec![]; .map(|(doc_id, reader_ordinal)| {
ff_reader.get_vals(*doc_id, &mut vals); let ff_reader = &ff_readers[*reader_ordinal as usize];
vals.into_iter() let mut vals = vec![];
}); ff_reader.get_vals(*doc_id, &mut vals);
let iter2 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| { vals.into_iter()
let ff_reader = &ff_readers[*reader_ordinal as usize]; })
let mut vals = vec![]; .flatten();
ff_reader.get_vals(*doc_id, &mut vals); let iter2 = doc_id_mapping
vals.into_iter() .iter()
}); .map(|(doc_id, reader_ordinal)| {
let ff_reader = &ff_readers[*reader_ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
})
.flatten();
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx( fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
field, field,
stats, stats,
@@ -786,7 +748,7 @@ impl IndexMerger {
&self, &self,
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
let reader_and_field_accessors = self let reader_and_field_accessors = self
.readers .readers
@@ -822,7 +784,7 @@ impl IndexMerger {
field_type: &FieldType, field_type: &FieldType,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Option<TermOrdinalMapping>> { ) -> crate::Result<Option<TermOrdinalMapping>> {
debug_time!("write_postings_for_field"); debug_time!("write_postings_for_field");
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000); let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
@@ -844,7 +806,7 @@ impl IndexMerger {
} }
let mut term_ord_mapping_opt = match field_type { let mut term_ord_mapping_opt = match field_type {
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)), FieldType::HierarchicalFacet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
_ => None, _ => None,
}; };
@@ -861,14 +823,15 @@ impl IndexMerger {
segment_local_map segment_local_map
}) })
.collect(); .collect();
for (new_doc_id, (old_doc_id, segment_ord)) in doc_id_mapping.iter().enumerate() { for (new_doc_id, (old_doc_id, segment_ordinal)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[*segment_ord as usize]; let segment_map = &mut merged_doc_id_map[*segment_ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId); segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
} }
// Note that the total number of tokens is not exact. // The total number of tokens will only be exact when there has been no deletes.
// It is only used as a parameter in the BM25 formula. //
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?; // Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids // Create the total list of doc ids
// by stacking the doc ids from the different segment. // by stacking the doc ids from the different segment.
@@ -903,7 +866,7 @@ impl IndexMerger {
let mut total_doc_freq = 0; let mut total_doc_freq = 0;
// Let's compute the list of non-empty posting lists // Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() { for (segment_ord, term_info) in merged_terms.current_segment_ordinals_and_term_infos() {
let segment_reader = &self.readers[segment_ord]; let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord]; let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings = inverted_index let segment_postings = inverted_index
@@ -953,9 +916,9 @@ impl IndexMerger {
// there is at least one document. // there is at least one document.
let term_freq = segment_postings.term_freq(); let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer); segment_postings.positions(&mut positions_buffer);
// if doc_id_mapping exists, the doc_ids are reordered, they are // if doc_id_mapping exists, the docids are reordered, they are
// not just stacked. The field serializer expects monotonically increasing // not just stacked. The field serializer expects monotonically increasing
// doc_ids, so we collect and sort them first, before writing. // docids, so we collect and sort them first, before writing.
// //
// I think this is not strictly necessary, it would be possible to // I think this is not strictly necessary, it would be possible to
// avoid the loading into a vec via some form of kmerge, but then the merge // avoid the loading into a vec via some form of kmerge, but then the merge
@@ -995,7 +958,7 @@ impl IndexMerger {
&self, &self,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> { ) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new(); let mut term_ordinal_mappings = HashMap::new();
for (field, field_entry) in self.schema.fields() { for (field, field_entry) in self.schema.fields() {
@@ -1018,7 +981,7 @@ impl IndexMerger {
fn write_storable_fields( fn write_storable_fields(
&self, &self,
store_writer: &mut StoreWriter, store_writer: &mut StoreWriter,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
debug_time!("write_storable_fields"); debug_time!("write_storable_fields");
@@ -1060,7 +1023,7 @@ impl IndexMerger {
// the doc stores would be on average half full, given total randomness (which // the doc stores would be on average half full, given total randomness (which
// is not the case here, but not sure how it behaves exactly). // is not the case here, but not sure how it behaves exactly).
// //
// https://github.com/quickwit-inc/tantivy/issues/1053 // https://github.com/tantivy-search/tantivy/issues/1053
// //
// take 7 in order to not walk over all checkpoints. // take 7 in order to not walk over all checkpoints.
|| store_reader.block_checkpoints().take(7).count() < 6 || store_reader.block_checkpoints().take(7).count() < 6
@@ -1132,13 +1095,13 @@ mod tests {
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::query::Scorer; use crate::query::Scorer;
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::Document;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::schema::IntOptions; use crate::schema::IntOptions;
use crate::schema::Term; use crate::schema::Term;
use crate::schema::TextFieldIndexing; use crate::schema::TextFieldIndexing;
use crate::schema::{Cardinality, TEXT}; use crate::schema::{Cardinality, TEXT};
use crate::schema::{Document, FacetOptions};
use crate::DocAddress; use crate::DocAddress;
use crate::IndexSettings; use crate::IndexSettings;
use crate::IndexSortByField; use crate::IndexSortByField;
@@ -1176,17 +1139,18 @@ mod tests {
score_field => 3u64, score_field => 3u64,
date_field => curr_time, date_field => curr_time,
bytes_score_field => 3u32.to_be_bytes().as_ref() bytes_score_field => 3u32.to_be_bytes().as_ref()
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "a b c", text_field => "a b c",
score_field => 5u64, score_field => 5u64,
bytes_score_field => 5u32.to_be_bytes().as_ref() bytes_score_field => 5u32.to_be_bytes().as_ref()
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "a b c d", text_field => "a b c d",
score_field => 7u64, score_field => 7u64,
bytes_score_field => 7u32.to_be_bytes().as_ref() bytes_score_field => 7u32.to_be_bytes().as_ref()
))?; ));
index_writer.commit()?; index_writer.commit()?;
// writing the segment // writing the segment
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -1194,12 +1158,12 @@ mod tests {
date_field => curr_time, date_field => curr_time,
score_field => 11u64, score_field => 11u64,
bytes_score_field => 11u32.to_be_bytes().as_ref() bytes_score_field => 11u32.to_be_bytes().as_ref()
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "a b c g", text_field => "a b c g",
score_field => 13u64, score_field => 13u64,
bytes_score_field => 13u32.to_be_bytes().as_ref() bytes_score_field => 13u32.to_be_bytes().as_ref()
))?; ));
index_writer.commit()?; index_writer.commit()?;
} }
{ {
@@ -1333,18 +1297,18 @@ mod tests {
text_field => "a b d", text_field => "a b d",
score_field => 1u64, score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1], bytes_score_field => vec![0u8, 0, 0, 1],
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "b c", text_field => "b c",
score_field => 2u64, score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2], bytes_score_field => vec![0u8, 0, 0, 2],
))?; ));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "c d", text_field => "c d",
score_field => 3u64, score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3], bytes_score_field => vec![0u8, 0, 0, 3],
))?; ));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -1374,24 +1338,24 @@ mod tests {
text_field => "a d e", text_field => "a d e",
score_field => 4_000u64, score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4], bytes_score_field => vec![0u8, 0, 0, 4],
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "e f", text_field => "e f",
score_field => 5_000u64, score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5], bytes_score_field => vec![0u8, 0, 0, 5],
))?; ));
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f")); index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "f g", text_field => "f g",
score_field => 6_000u64, score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112], bytes_score_field => vec![0u8, 0, 23, 112],
))?; ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "g h", text_field => "g h",
score_field => 7_000u64, score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88], bytes_score_field => vec![0u8, 0, 27, 88],
))?; ));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -1609,7 +1573,7 @@ mod tests {
#[test] #[test]
fn test_merge_facets_sort_asc() { fn test_merge_facets_sort_asc() {
// In the merge case this will go through the doc_id mapping code // In the merge case this will go through the docid mapping code
test_merge_facets( test_merge_facets(
Some(IndexSettings { Some(IndexSettings {
sort_by_field: Some(IndexSortByField { sort_by_field: Some(IndexSortByField {
@@ -1620,7 +1584,7 @@ mod tests {
}), }),
true, true,
); );
// In the merge case this will not go through the doc_id mapping code, because the data is // In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct // sorted and disjunct
test_merge_facets( test_merge_facets(
Some(IndexSettings { Some(IndexSettings {
@@ -1636,7 +1600,7 @@ mod tests {
#[test] #[test]
fn test_merge_facets_sort_desc() { fn test_merge_facets_sort_desc() {
// In the merge case this will go through the doc_id mapping code // In the merge case this will go through the docid mapping code
test_merge_facets( test_merge_facets(
Some(IndexSettings { Some(IndexSettings {
sort_by_field: Some(IndexSortByField { sort_by_field: Some(IndexSortByField {
@@ -1647,7 +1611,7 @@ mod tests {
}), }),
true, true,
); );
// In the merge case this will not go through the doc_id mapping code, because the data is // In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct // sorted and disjunct
test_merge_facets( test_merge_facets(
Some(IndexSettings { Some(IndexSettings {
@@ -1664,7 +1628,7 @@ mod tests {
// ranges between segments so that merge algorithm can't apply certain optimizations // ranges between segments so that merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) { fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue) .set_fast(Cardinality::SingleValue)
.set_indexed(); .set_indexed();
@@ -1687,7 +1651,7 @@ mod tests {
} }
doc.add_u64(int_field, *int_val); doc.add_u64(int_field, *int_val);
*int_val += 1; *int_val += 1;
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
}; };
index_doc( index_doc(
@@ -1800,69 +1764,70 @@ mod tests {
} }
#[test] #[test]
fn test_bug_merge() -> crate::Result<()> { fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED); let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64))?; index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64))?; index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1)); index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
block_on(index_writer.merge(&segment_ids))?; block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
reader.reload()?; reader.reload().unwrap();
// commit has not been called yet. The document should still be // commit has not been called yet. The document should still be
// there. // there.
assert_eq!(reader.searcher().num_docs(), 2); assert_eq!(reader.searcher().num_docs(), 2);
Ok(())
} }
#[test] #[test]
fn test_merge_multivalued_int_fields_all_deleted() -> crate::Result<()> { fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_indexed(); .set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options); let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader()?; let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, 1); doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone())?; index_writer.add_document(doc.clone());
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.add_document(doc)?; index_writer.add_document(doc);
index_writer.commit()?; assert!(index_writer.commit().is_ok());
index_writer.delete_term(Term::from_field_u64(int_field, 1)); index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?; let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
// assert delete has not been committed // assert delete has not been committed
reader.reload()?; assert!(reader.reload().is_ok());
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads().unwrap();
} }
reader.reload()?; reader.reload().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 0); assert_eq!(searcher.num_docs(), 0);
Ok(())
} }
#[test] #[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> { fn test_merge_multivalued_int_fields_simple() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
@@ -1871,13 +1836,13 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| { let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default(); let mut doc = Document::default();
for &val in int_vals { for &val in int_vals {
doc.add_u64(int_field, val); doc.add_u64(int_field, val);
} }
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
}; };
index_doc(&mut index_writer, &[1, 2]); index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 2, 3]); index_doc(&mut index_writer, &[1, 2, 3]);
@@ -1893,7 +1858,7 @@ mod tests {
index_doc(&mut index_writer, &[1_000]); index_doc(&mut index_writer, &[1_000]);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut vals: Vec<u64> = Vec::new(); let mut vals: Vec<u64> = Vec::new();
@@ -1942,12 +1907,14 @@ mod tests {
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index
let mut index_writer = index.writer_for_tests()?; .searchable_segment_ids()
block_on(index_writer.merge(&segment_ids))?; .expect("Searchable segments failed.");
index_writer.wait_merging_threads()?; let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
} }
reader.reload()?; assert!(reader.reload().is_ok());
{ {
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -1984,7 +1951,6 @@ mod tests {
ff_reader.get_vals(9, &mut vals); ff_reader.get_vals(9, &mut vals);
assert_eq!(&vals, &[20]); assert_eq!(&vals, &[20]);
} }
Ok(())
} }
#[test] #[test]
@@ -2010,7 +1976,7 @@ mod tests {
doc.add_f64(field, 42.0); doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24); doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27); doc.add_f64(multi_field, 0.27);
writer.add_document(doc)?; writer.add_document(doc);
if i % 5 == 0 { if i % 5 == 0 {
writer.commit()?; writer.commit()?;
} }
@@ -2034,7 +2000,7 @@ mod tests {
let happy_term = Term::from_field_text(text, "happy"); let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs); let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 { for _ in 0..62 {
writer.add_document(doc!(text=>"hello happy tax payer"))?; writer.add_document(doc!(text=>"hello happy tax payer"));
} }
writer.commit()?; writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
@@ -2046,7 +2012,7 @@ mod tests {
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855); assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
assert_nearly_equals!(term_scorer.score(), 0.0079681855); assert_nearly_equals!(term_scorer.score(), 0.0079681855);
for _ in 0..81 { for _ in 0..81 {
writer.add_document(doc!(text=>"hello happy tax payer"))?; writer.add_document(doc!(text=>"hello happy tax payer"));
} }
writer.commit()?; writer.commit()?;
reader.reload()?; reader.reload()?;

View File

@@ -1,17 +1,22 @@
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{AliveBitSet, FastFieldReader}; use crate::fastfield::{AliveBitSet, FastFieldReader};
use crate::query::QueryParser; use crate::schema::IndexRecordOption;
use crate::schema::{ use crate::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing, collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
};
use crate::{core::Index, fastfield::MultiValuedFastFieldReader};
use crate::{
query::QueryParser,
schema::{IntOptions, TextOptions},
};
use crate::{schema::Facet, IndexSortByField};
use crate::{schema::INDEXED, Order};
use crate::{
schema::{self, BytesOptions},
DocAddress,
}; };
use crate::schema::{IntOptions, TextOptions};
use crate::DocAddress;
use crate::IndexSortByField;
use crate::Order;
use crate::{DocSet, IndexSettings, Postings, Term}; use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on; use futures::executor::block_on;
@@ -22,7 +27,7 @@ mod tests {
.set_indexed(); .set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options); let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -34,17 +39,14 @@ mod tests {
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime"))) index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
.unwrap(); index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
index_writer
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime"))) assert!(index_writer.commit().is_ok());
.unwrap(); index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
index_writer.commit().unwrap();
index_writer assert!(index_writer.commit().is_ok());
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
.unwrap();
index_writer.commit().unwrap();
} }
// Merging the segments // Merging the segments
@@ -64,7 +66,7 @@ mod tests {
fn create_test_index( fn create_test_index(
index_settings: Option<IndexSettings>, index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool, force_disjunct_segment_sort_values: bool,
) -> crate::Result<Index> { ) -> Index {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue) .set_fast(Cardinality::SingleValue)
@@ -74,7 +76,7 @@ mod tests {
let bytes_options = BytesOptions::default().set_fast().set_indexed(); let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options); let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let multi_numbers = schema_builder.add_u64_field( let multi_numbers = schema_builder.add_u64_field(
"multi_numbers", "multi_numbers",
@@ -93,34 +95,34 @@ mod tests {
if let Some(settings) = index_settings { if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings); index_builder = index_builder.settings(settings);
} }
let index = index_builder.create_in_ram()?; let index = index_builder.create_in_ram().unwrap();
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
// segment 1 - range 1-3 // segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?; index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document( index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")), doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?; );
index_writer.add_document( index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"), doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?; );
index_writer.add_document( index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"), doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?; );
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20 // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?; index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
let in_val = if force_disjunct_segment_sort_values { let in_val = if force_disjunct_segment_sort_values {
10_u64 10_u64
} else { } else {
1 1
}; };
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?; index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000 // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values { let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50] [100_u64, 50]
@@ -129,24 +131,26 @@ mod tests {
}; };
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1] index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")), doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?; );
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?; index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
index_writer.add_document( index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num") doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?; );
index_writer.delete_term(Term::from_field_text(text_field, "deleteme")); index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index
let mut index_writer = index.writer_for_tests()?; .searchable_segment_ids()
block_on(index_writer.merge(&segment_ids))?; .expect("Searchable segments failed.");
index_writer.wait_merging_threads()?; let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
} }
Ok(index) index
} }
#[test] #[test]
@@ -179,8 +183,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
force_disjunct_segment_sort_values, force_disjunct_segment_sort_values,
) );
.unwrap();
let int_field = index.schema().get_field("intval").unwrap(); let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
@@ -297,8 +300,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
false, false,
) );
.unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -365,8 +367,7 @@ mod tests {
..Default::default() ..Default::default()
}), }),
false, false,
) );
.unwrap();
let int_field = index.schema().get_field("intval").unwrap(); let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap(); let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -516,7 +517,7 @@ mod bench_sorted_index_merge {
let index_doc = |index_writer: &mut IndexWriter, val: u64| { let index_doc = |index_writer: &mut IndexWriter, val: u64| {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, val); doc.add_u64(int_field, val);
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
}; };
// 3 segments with 10_000 values in the fast fields // 3 segments with 10_000 values in the fast fields
for _ in 0..3 { for _ in 0..3 {
@@ -553,7 +554,7 @@ mod bench_sorted_index_merge {
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(doc_id, reader, u64_reader) (doc_id, reader, u64_reader)
}); });
// add values in order of the new doc_ids // add values in order of the new docids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get(*doc_id); val = field_reader.get(*doc_id);
@@ -566,7 +567,7 @@ mod bench_sorted_index_merge {
Ok(()) Ok(())
} }
#[bench] #[bench]
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> { fn create_sorted_index_create_docid_mapping(b: &mut Bencher) -> crate::Result<()> {
let sort_by_field = IndexSortByField { let sort_by_field = IndexSortByField {
field: "intval".to_string(), field: "intval".to_string(),
order: Order::Desc, order: Order::Desc,


@@ -1,17 +1,15 @@
pub mod delete_queue; pub mod delete_queue;
pub mod demuxer;
pub mod doc_id_mapping; pub mod doc_id_mapping;
mod doc_opstamp_mapping; mod doc_opstamp_mapping;
pub mod index_writer; pub mod index_writer;
mod index_writer_status;
mod log_merge_policy; mod log_merge_policy;
mod merge_operation; mod merge_operation;
pub mod merge_policy; pub mod merge_policy;
pub mod merger; pub mod merger;
mod merger_sorted_index_test; mod merger_sorted_index_test;
pub mod operation; pub mod operation;
pub mod prepared_commit; mod prepared_commit;
mod segment_entry; mod segment_entry;
mod segment_manager; mod segment_manager;
mod segment_register; mod segment_register;
@@ -20,11 +18,6 @@ pub mod segment_updater;
mod segment_writer; mod segment_writer;
mod stamper; mod stamper;
use crossbeam::channel;
use smallvec::SmallVec;
use crate::indexer::operation::AddOperation;
pub use self::index_writer::IndexWriter; pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy; pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation; pub use self::merge_operation::MergeOperation;
@@ -33,23 +26,12 @@ pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry; pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager; pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer; pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::merge_filtered_segments; pub use self::segment_updater::merge_segments;
pub use self::segment_updater::merge_indices;
pub use self::segment_writer::SegmentWriter; pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`. /// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy; pub type DefaultMergePolicy = LogMergePolicy;
// Batch of documents.
// Most of the time, users will send operations one-by-one, but it can be useful to
// send them as a small block to ensure that
// - all docs in the operation will end up in the same segment, with contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
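A minimal sketch of how a caller might hand such a batch to the writer, assuming the `UserOperation` / `IndexWriter::run` API that is re-exported at the crate root elsewhere in this diff. The field name, heap budget, and document contents are illustrative, and since the exact return types of `run` and `add_document` differ between the two sides of this compare, the opstamps are simply ignored here.

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term, UserOperation};

fn batched_operations() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(15_000_000)?;

    // Grouped operations are applied atomically at the next commit, and the
    // added docs share a segment with contiguous doc_ids (per the comment above).
    let batch = vec![
        UserOperation::Add(doc!(text => "first doc of the batch")),
        UserOperation::Add(doc!(text => "second doc of the batch")),
        UserOperation::Delete(Term::from_field_text(text, "obsolete")),
    ];
    let _ = writer.run(batch);
    writer.commit()?;
    Ok(())
}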
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests_mmap { mod tests_mmap {
@@ -57,20 +39,19 @@ mod tests_mmap {
use crate::{Index, Term}; use crate::{Index, Term};
#[test] #[test]
fn test_advance_delete_bug() -> crate::Result<()> { fn test_advance_delete_bug() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?; let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment // there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
// we need enough data to trigger the bug (at least 32 documents) // we need enough data to trigger the bug (at least 32 documents)
for _ in 0..32 { for _ in 0..32 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
} }
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.commit()?; index_writer.commit().unwrap();
Ok(())
} }
} }


@@ -18,38 +18,25 @@ impl<'a> PreparedCommit<'a> {
} }
} }
/// Returns the opstamp associated to the prepared commit.
pub fn opstamp(&self) -> Opstamp { pub fn opstamp(&self) -> Opstamp {
self.opstamp self.opstamp
} }
/// Adds an arbitrary payload to the commit.
pub fn set_payload(&mut self, payload: &str) { pub fn set_payload(&mut self, payload: &str) {
self.payload = Some(payload.to_string()) self.payload = Some(payload.to_string())
} }
/// Rollbacks any change.
pub fn abort(self) -> crate::Result<Opstamp> { pub fn abort(self) -> crate::Result<Opstamp> {
self.index_writer.rollback() self.index_writer.rollback()
} }
/// Proceeds to commit.
/// See `.commit_async()`.
pub fn commit(self) -> crate::Result<Opstamp> { pub fn commit(self) -> crate::Result<Opstamp> {
block_on(self.commit_async())
}
/// Proceeds to commit.
///
/// Unfortunately, contrary to what `PrepareCommit` may suggest,
/// this operation is not really lightweight.
/// At this point deletes have not been flushed yet.
pub async fn commit_async(self) -> crate::Result<Opstamp> {
info!("committing {}", self.opstamp); info!("committing {}", self.opstamp);
self.index_writer let _ = block_on(
.segment_updater() self.index_writer
.schedule_commit(self.opstamp, self.payload) .segment_updater()
.await?; .schedule_commit(self.opstamp, self.payload),
);
Ok(self.opstamp) Ok(self.opstamp)
} }
} }
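For context, a hedged sketch of the two-phase flow these methods describe: stage a commit with `prepare_commit`, optionally attach a payload, then either commit or abort. It assumes a plain in-RAM index; the payload string and heap budget are illustrative.

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn two_phase_commit() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(15_000_000)?;
    let _ = writer.add_document(doc!(text => "hello"));

    // Stage the commit; nothing is visible to readers yet.
    let mut prepared = writer.prepare_commit()?;
    // Attach an arbitrary payload (e.g. an upstream checkpoint) to this commit.
    prepared.set_payload("checkpoint-42");
    // Proceed with the commit; calling `prepared.abort()` instead would roll back.
    let _opstamp = prepared.commit()?;
    Ok(())
}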


@@ -66,10 +66,13 @@ impl SegmentRegister {
} }
pub fn segment_metas(&self) -> Vec<SegmentMeta> { pub fn segment_metas(&self) -> Vec<SegmentMeta> {
self.segment_states let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
.values() .values()
.map(|segment_entry| segment_entry.meta().clone()) .map(|segment_entry| segment_entry.meta().clone())
.collect() .collect();
segment_ids.sort_by_key(SegmentMeta::id);
segment_ids
} }
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool { pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {


@@ -7,7 +7,6 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta; use crate::core::SegmentMeta;
use crate::core::META_FILEPATH; use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult}; use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::indexer::delete_queue::DeleteCursor; use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::index_writer::advance_deletes; use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory; use crate::indexer::merge_operation::MergeOperationInventory;
@@ -20,15 +19,12 @@ use crate::indexer::{DefaultMergePolicy, MergePolicy};
use crate::indexer::{MergeCandidate, MergeOperation}; use crate::indexer::{MergeCandidate, MergeOperation};
use crate::schema::Schema; use crate::schema::Schema;
use crate::Opstamp; use crate::Opstamp;
use crate::TantivyError;
use fail::fail_point;
use futures::channel::oneshot; use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder}; use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future; use futures::future::Future;
use futures::future::TryFutureExt; use futures::future::TryFutureExt;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::collections::HashSet; use std::collections::HashSet;
use std::io;
use std::io::Write; use std::io::Write;
use std::ops::Deref; use std::ops::Deref;
use std::path::PathBuf; use std::path::PathBuf;
@@ -61,9 +57,7 @@ pub fn save_new_metas(
payload: None, payload: None,
}, },
directory, directory,
)?; )
directory.sync_directory()?;
Ok(())
} }
/// Save the index meta file. /// Save the index meta file.
@@ -80,11 +74,6 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
let mut buffer = serde_json::to_vec_pretty(metas)?; let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer. // Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?; writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
))));
directory.sync_directory()?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?; directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(()) Ok(())
@@ -170,9 +159,9 @@ fn merge(
/// meant to work if you have an IndexWriter running for the origin indices, or /// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index. /// the destination Index.
#[doc(hidden)] #[doc(hidden)]
pub fn merge_indices<T: Into<Box<dyn Directory>>>( pub fn merge_segments<Dir: Directory>(
indices: &[Index], indices: &[Index],
output_directory: T, output_directory: Dir,
) -> crate::Result<Index> { ) -> crate::Result<Index> {
if indices.is_empty() { if indices.is_empty() {
// If there are no indices to merge, there is no need to do anything. // If there are no indices to merge, there is no need to do anything.
@@ -181,8 +170,19 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
)); ));
} }
let target_schema = indices[0].schema();
let target_settings = indices[0].settings().clone(); let target_settings = indices[0].settings().clone();
// let's check that all of the indices have the same schema
if indices
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
// let's check that all of the indices have the same index settings // let's check that all of the indices have the same index settings
if indices if indices
.iter() .iter()
@@ -199,61 +199,13 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
segments.extend(index.searchable_segments()?); segments.extend(index.searchable_segments()?);
} }
let non_filter = segments.iter().map(|_| None).collect::<Vec<_>>(); let mut merged_index = Index::create(output_directory, target_schema.clone(), target_settings)?;
merge_filtered_segments(&segments, target_settings, non_filter, output_directory)
}
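A hedged sketch of the merge entry point described above: merging two same-schema indices into an empty output directory. The left-hand side of this compare exports it as `merge_indices`, the right-hand side as `merge_segments`; the sketch uses the former, and the schema, contents and heap budget are illustrative. No `IndexWriter` may be running on the source indices or on the destination.

use tantivy::directory::RamDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn merge_two_indices() -> tantivy::Result<()> {
    // Two source indices sharing the same schema.
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();

    let mut indices = Vec::new();
    for &content in ["first index", "second index"].iter() {
        let index = Index::create_in_ram(schema.clone());
        let mut writer = index.writer(15_000_000)?;
        let _ = writer.add_document(doc!(text => content));
        writer.commit()?;
        indices.push(index);
    }

    // The output directory is assumed to be empty.
    let merged = tantivy::merge_indices(&indices, RamDirectory::default())?;
    assert_eq!(merged.searchable_segments()?.len(), 1);
    Ok(())
}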
/// Advanced: Merges a list of segments from different indices in a new index.
/// Additionally, you can provide a delete bitset for each segment to ignore doc_ids.
///
/// Returns `TantivyError` if the indices list is empty or their
/// schemas don't match.
///
/// `output_directory`: is assumed to be empty.
///
/// # Warning
/// This function does NOT check whether an `IndexWriter` is running. It is not
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
segments: &[Segment],
target_settings: IndexSettings,
filter_doc_ids: Vec<Option<AliveBitSet>>,
output_directory: T,
) -> crate::Result<Index> {
if segments.is_empty() {
// If there are no indices to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
"No segments given to marge".to_string(),
));
}
let target_schema = segments[0].schema();
// let's check that all of the indices have the same schema
if segments
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
let mut merged_index = Index::create(
output_directory,
target_schema.clone(),
target_settings.clone(),
)?;
let merged_segment = merged_index.new_segment(); let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id(); let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set( let merger: IndexMerger = IndexMerger::open(
merged_index.schema(), merged_index.schema(),
merged_index.settings().clone(), merged_index.settings().clone(),
segments, &segments[..],
filter_doc_ids,
)?; )?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?; let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let num_docs = merger.write(segment_serializer)?; let num_docs = merger.write(segment_serializer)?;
@@ -273,7 +225,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
); );
let index_meta = IndexMeta { let index_meta = IndexMeta {
index_settings: target_settings, // index_settings of all segments should be the same index_settings: indices[0].load_metas()?.index_settings, // index_settings of all segments should be the same
segments: vec![segment_meta], segments: vec![segment_meta],
schema: target_schema, schema: target_schema,
opstamp: 0u64, opstamp: 0u64,
@@ -354,39 +306,37 @@ impl SegmentUpdater {
*self.merge_policy.write().unwrap() = arc_merge_policy; *self.merge_policy.write().unwrap() = arc_merge_policy;
} }
async fn schedule_task< fn schedule_future<T: 'static + Send, F: Future<Output = crate::Result<T>> + 'static + Send>(
T: 'static + Send,
F: Future<Output = crate::Result<T>> + 'static + Send,
>(
&self, &self,
task: F, f: F,
) -> crate::Result<T> { ) -> impl Future<Output = crate::Result<T>> {
if !self.is_alive() {
return Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
));
}
let (sender, receiver) = oneshot::channel(); let (sender, receiver) = oneshot::channel();
self.pool.spawn_ok(async move { if self.is_alive() {
let task_result = task.await; self.pool.spawn_ok(async move {
let _ = sender.send(task_result); let _ = sender.send(f.await);
}); });
let task_result = receiver.await; } else {
task_result.unwrap_or_else(|_| { let _ = sender.send(Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
)));
}
receiver.unwrap_or_else(|_| {
let err_msg = let err_msg =
"A segment_updater future did not success. This should never happen.".to_string(); "A segment_updater future did not success. This should never happen.".to_string();
Err(crate::TantivyError::SystemError(err_msg)) Err(crate::TantivyError::SystemError(err_msg))
}) })
} }
pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> { pub fn schedule_add_segment(
&self,
segment_entry: SegmentEntry,
) -> impl Future<Output = crate::Result<()>> {
let segment_updater = self.clone(); let segment_updater = self.clone();
self.schedule_task(async move { self.schedule_future(async move {
segment_updater.segment_manager.add_segment(segment_entry); segment_updater.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options().await; segment_updater.consider_merge_options().await;
Ok(()) Ok(())
}) })
.await
} }
/// Orders `SegmentManager` to remove all segments /// Orders `SegmentManager` to remove all segments
@@ -453,9 +403,11 @@ impl SegmentUpdater {
Ok(()) Ok(())
} }
pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> { pub fn schedule_garbage_collect(
&self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
let garbage_collect_future = garbage_collect_files(self.clone()); let garbage_collect_future = garbage_collect_files(self.clone());
self.schedule_task(garbage_collect_future).await self.schedule_future(garbage_collect_future)
} }
/// List the files that are useful to the index. /// List the files that are useful to the index.
@@ -473,13 +425,13 @@ impl SegmentUpdater {
files files
} }
pub(crate) async fn schedule_commit( pub fn schedule_commit(
&self, &self,
opstamp: Opstamp, opstamp: Opstamp,
payload: Option<String>, payload: Option<String>,
) -> crate::Result<()> { ) -> impl Future<Output = crate::Result<()>> {
let segment_updater: SegmentUpdater = self.clone(); let segment_updater: SegmentUpdater = self.clone();
self.schedule_task(async move { self.schedule_future(async move {
let segment_entries = segment_updater.purge_deletes(opstamp)?; let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries); segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?; segment_updater.save_metas(opstamp, payload)?;
@@ -487,7 +439,6 @@ impl SegmentUpdater {
segment_updater.consider_merge_options().await; segment_updater.consider_merge_options().await;
Ok(()) Ok(())
}) })
.await
} }
fn store_meta(&self, index_meta: &IndexMeta) { fn store_meta(&self, index_meta: &IndexMeta) {
@@ -562,7 +513,9 @@ impl SegmentUpdater {
e e
); );
// ... cancel merge // ... cancel merge
assert!(!cfg!(test), "Merge failed."); if cfg!(test) {
panic!("Merge failed.");
}
} }
} }
}); });
@@ -615,14 +568,14 @@ impl SegmentUpdater {
} }
} }
async fn end_merge( fn end_merge(
&self, &self,
merge_operation: MergeOperation, merge_operation: MergeOperation,
mut after_merge_segment_entry: SegmentEntry, mut after_merge_segment_entry: SegmentEntry,
) -> crate::Result<SegmentMeta> { ) -> impl Future<Output = crate::Result<SegmentMeta>> {
let segment_updater = self.clone(); let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone(); let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
self.schedule_task(async move { let end_merge_future = self.schedule_future(async move {
info!("End merge {:?}", after_merge_segment_entry.meta()); info!("End merge {:?}", after_merge_segment_entry.meta());
{ {
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -641,8 +594,9 @@ impl SegmentUpdater {
merge_operation.segment_ids(), merge_operation.segment_ids(),
advance_deletes_err advance_deletes_err
); );
assert!(!cfg!(test), "Merge failed."); if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge // ... cancel merge
// `merge_operations` are tracked. As it is dropped, the // `merge_operations` are tracked. As it is dropped, the
// segment_ids will be available again for merge. // segment_ids will be available again for merge.
@@ -665,9 +619,8 @@ impl SegmentUpdater {
let _ = garbage_collect_files(segment_updater).await; let _ = garbage_collect_files(segment_updater).await;
Ok(()) Ok(())
}) });
.await?; end_merge_future.map_ok(|_| after_merge_segment_meta)
Ok(after_merge_segment_meta)
} }
/// Wait for current merging threads. /// Wait for current merging threads.
@@ -693,19 +646,11 @@ impl SegmentUpdater {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::merge_indices; use super::merge_segments;
use crate::collector::TopDocs;
use crate::directory::RamDirectory; use crate::directory::RamDirectory;
use crate::fastfield::AliveBitSet;
use crate::indexer::merge_policy::tests::MergeWheneverPossible; use crate::indexer::merge_policy::tests::MergeWheneverPossible;
use crate::indexer::merger::IndexMerger;
use crate::indexer::segment_updater::merge_filtered_segments;
use crate::query::QueryParser;
use crate::schema::*; use crate::schema::*;
use crate::Directory;
use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::Segment;
#[test] #[test]
fn test_delete_during_merge() -> crate::Result<()> { fn test_delete_during_merge() -> crate::Result<()> {
@@ -718,19 +663,19 @@ mod tests {
index_writer.set_merge_policy(Box::new(MergeWheneverPossible)); index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"))?; index_writer.add_document(doc!(text_field=>"d"));
} }
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"))?; index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"))?; index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?; index_writer.commit()?;
let term = Term::from_field_text(text_field, "a"); let term = Term::from_field_text(text_field, "a");
@@ -748,50 +693,6 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn delete_all_docs_min() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests()?;
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(!seg_ids.is_empty());
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
index_writer.commit()?;
let term = Term::from_field_text(text_field, "b");
index_writer.delete_term(term);
index_writer.commit()?;
index_writer.wait_merging_threads()?;
let reader = index.reader()?;
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index.searchable_segment_ids()?;
assert!(seg_ids.is_empty());
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas()?.is_empty());
assert!(reader.searcher().segment_readers().is_empty());
Ok(())
}
#[test] #[test]
fn delete_all_docs() -> crate::Result<()> { fn delete_all_docs() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -802,19 +703,19 @@ mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"))?; index_writer.add_document(doc!(text_field=>"d"));
} }
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"))?; index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"))?; index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?; index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?; let seg_ids = index.searchable_segment_ids()?;
@@ -854,8 +755,8 @@ mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"));
} }
index_writer.commit()?; index_writer.commit()?;
@@ -881,22 +782,22 @@ mod tests {
// writing two segments // writing two segments
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 { for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"fizz"))?; index_writer.add_document(doc!(text_field=>"fizz"));
index_writer.add_document(doc!(text_field=>"buzz"))?; index_writer.add_document(doc!(text_field=>"buzz"));
} }
index_writer.commit()?; index_writer.commit()?;
for _ in 0..1000 { for _ in 0..1000 {
index_writer.add_document(doc!(text_field=>"foo"))?; index_writer.add_document(doc!(text_field=>"foo"));
index_writer.add_document(doc!(text_field=>"bar"))?; index_writer.add_document(doc!(text_field=>"bar"));
} }
index_writer.commit()?; index_writer.commit()?;
indices.push(index); indices.push(index);
} }
assert_eq!(indices.len(), 3); assert_eq!(indices.len(), 3);
let output_directory: Box<dyn Directory> = Box::new(RamDirectory::default()); let output_directory = RamDirectory::default();
let index = merge_indices(&indices, output_directory)?; let index = merge_segments(&indices, output_directory)?;
assert_eq!(index.schema(), schema); assert_eq!(index.schema(), schema);
let segments = index.searchable_segments()?; let segments = index.searchable_segments()?;
@@ -910,7 +811,7 @@ mod tests {
#[test] #[test]
fn test_merge_empty_indices_array() { fn test_merge_empty_indices_array() {
let merge_result = merge_indices(&[], RamDirectory::default()); let merge_result = merge_segments(&[], RamDirectory::default());
assert!(merge_result.is_err()); assert!(merge_result.is_err());
} }
@@ -921,7 +822,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text"))?; index_writer.add_document(doc!(text_field=>"some text"));
index_writer.commit()?; index_writer.commit()?;
index index
}; };
@@ -931,197 +832,15 @@ mod tests {
let body_field = schema_builder.add_text_field("body", TEXT); let body_field = schema_builder.add_text_field("body", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(body_field=>"some body"))?; index_writer.add_document(doc!(body_field=>"some body"));
index_writer.commit()?; index_writer.commit()?;
index index
}; };
// mismatched schema index list // mismatched schema index list
let result = merge_indices(&[first_index, second_index], RamDirectory::default()); let result = merge_segments(&[first_index, second_index], RamDirectory::default());
assert!(result.is_err()); assert!(result.is_err());
Ok(()) Ok(())
} }
#[test]
fn test_merge_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment_1 = AliveBitSet::for_test_from_deleted_docs(&[1], 2);
let filter_segment_2 = AliveBitSet::for_test_from_deleted_docs(&[0], 2);
let filter_segments = vec![Some(filter_segment_1), Some(filter_segment_2)];
let merged_index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = merged_index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
Ok(())
}
#[test]
fn test_merge_single_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"test text"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[0], 4);
let filter_segments = vec![Some(filter_segment)];
let index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 2);
let searcher = index.reader()?.searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("test"), vec![] as Vec<u32>);
assert_eq!(do_search("text"), vec![0, 1]);
}
Ok(())
}
#[test]
fn test_apply_doc_id_filter_in_merger() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
{
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
let filter_segments = vec![Some(filter_segment)];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 2]);
}
{
let filter_segments = vec![None];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 1, 2]);
}
Ok(())
}
} }


@@ -2,6 +2,7 @@ use super::{
doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}, doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping},
operation::AddOperation, operation::AddOperation,
}; };
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer; use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::compute_table_size; use crate::postings::compute_table_size;
@@ -17,7 +18,6 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
use crate::{core::Segment, store::StoreWriter}; use crate::{core::Segment, store::StoreWriter};
use crate::{fastfield::FastFieldsWriter, schema::Type};
use crate::{DocId, SegmentComponent}; use crate::{DocId, SegmentComponent};
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
@@ -173,11 +173,18 @@ impl SegmentWriter {
let (term_buffer, multifield_postings) = let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings); (&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() { match *field_entry.field_type() {
FieldType::Facet(_) => { FieldType::HierarchicalFacet(_) => {
term_buffer.set_field(Type::Facet, field); term_buffer.set_field(field);
for field_value in field_values { let facets =
let facet = field_value.value().facet().ok_or_else(make_schema_error)?; field_values
let facet_str = facet.encoded_str(); .iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
});
for facet_str in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
FacetTokenizer FacetTokenizer
.token_stream(facet_str) .token_stream(facet_str)
@@ -234,11 +241,12 @@ impl SegmentWriter {
term_buffer, term_buffer,
) )
}; };
self.fieldnorms_writer.record(doc_id, field, num_tokens); self.fieldnorms_writer.record(doc_id, field, num_tokens);
} }
FieldType::U64(_) => { FieldType::U64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::U64, field_value.field()); term_buffer.set_field(field_value.field());
let u64_val = field_value let u64_val = field_value
.value() .value()
.u64_value() .u64_value()
@@ -249,7 +257,7 @@ impl SegmentWriter {
} }
FieldType::Date(_) => { FieldType::Date(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::Date, field_value.field()); term_buffer.set_field(field_value.field());
let date_val = field_value let date_val = field_value
.value() .value()
.date_value() .date_value()
@@ -260,7 +268,7 @@ impl SegmentWriter {
} }
FieldType::I64(_) => { FieldType::I64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::I64, field_value.field()); term_buffer.set_field(field_value.field());
let i64_val = field_value let i64_val = field_value
.value() .value()
.i64_value() .i64_value()
@@ -271,7 +279,7 @@ impl SegmentWriter {
} }
FieldType::F64(_) => { FieldType::F64(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::F64, field_value.field()); term_buffer.set_field(field_value.field());
let f64_val = field_value let f64_val = field_value
.value() .value()
.f64_value() .f64_value()
@@ -282,7 +290,7 @@ impl SegmentWriter {
} }
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(Type::Bytes, field_value.field()); term_buffer.set_field(field_value.field());
let bytes = field_value let bytes = field_value
.value() .value()
.bytes_value() .bytes_value()


@@ -10,8 +10,8 @@
)] )]
#![doc(test(attr(allow(unused_variables), deny(warnings))))] #![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)] #![warn(missing_docs)]
#![allow(clippy::len_without_is_empty)]
#![allow(clippy::return_self_not_must_use)] #![feature(async_closure)]
//! # `tantivy` //! # `tantivy`
//! //!
@@ -64,7 +64,7 @@
//! body => "He was an old man who fished alone in a skiff in \ //! body => "He was an old man who fished alone in a skiff in \
//! the Gulf Stream and he had gone eighty-four days \ //! the Gulf Stream and he had gone eighty-four days \
//! now without taking a fish." //! now without taking a fish."
//! ))?; //! ));
//! //!
//! // We need to call .commit() explicitly to force the //! // We need to call .commit() explicitly to force the
//! // index_writer to finish processing the documents in the queue, //! // index_writer to finish processing the documents in the queue,
@@ -105,7 +105,7 @@
//! A good place for you to get started is to check out //! A good place for you to get started is to check out
//! the example code ( //! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) / //! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-inc/tantivy/blob/main/examples/basic_search.rs)) //! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
#[cfg_attr(test, macro_use)] #[cfg_attr(test, macro_use)]
extern crate serde_json; extern crate serde_json;
@@ -128,6 +128,8 @@ mod macros;
pub use crate::error::TantivyError; pub use crate::error::TantivyError;
pub use chrono; pub use chrono;
pub const PKG_JS: &'static str = "./pkg/pool_exec.js"; // path to `wasm-bindgen`'s JS binding
/// Tantivy result. /// Tantivy result.
/// ///
/// Within tantivy, please avoid importing `Result` using `use crate::Result` /// Within tantivy, please avoid importing `Result` using `use crate::Result`
@@ -158,7 +160,7 @@ pub mod termdict;
mod reader; mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer}; pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet; mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator}; pub use self::snippet::{Snippet, SnippetGenerator};
@@ -166,20 +168,17 @@ mod docset;
pub use self::docset::{DocSet, TERMINATED}; pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{Executor, SegmentComponent}; pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{ pub use crate::core::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
SearcherGeneration, Segment, SegmentId, SegmentMeta, SegmentId, SegmentMeta,
}; };
pub use crate::core::{InvertedIndexReader, SegmentReader}; pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory; pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*; pub use crate::indexer::merge_segments;
pub use crate::indexer::merge_filtered_segments;
pub use crate::indexer::merge_indices;
pub use crate::indexer::operation::UserOperation; pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{IndexWriter, PreparedCommit}; pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings; pub use crate::postings::Postings;
pub use crate::reader::LeasedItem; pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term}; pub use crate::schema::{Document, Term};
pub use census::{Inventory, TrackedObject};
pub use common::HasLen; pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt; use std::fmt;
@@ -239,7 +238,6 @@ pub fn version_string() -> &'static str {
pub mod merge_policy { pub mod merge_policy {
pub use crate::indexer::DefaultMergePolicy; pub use crate::indexer::DefaultMergePolicy;
pub use crate::indexer::LogMergePolicy; pub use crate::indexer::LogMergePolicy;
pub use crate::indexer::MergeCandidate;
pub use crate::indexer::MergePolicy; pub use crate::indexer::MergePolicy;
pub use crate::indexer::NoMergePolicy; pub use crate::indexer::NoMergePolicy;
} }
@@ -382,22 +380,24 @@ pub mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?; let index = Index::create_from_tempdir(schema).unwrap();
// writing the segment
let mut index_writer = index.writer_for_tests()?;
{ {
let doc = doc!(text_field=>"af b"); // writing the segment
index_writer.add_document(doc)?; let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
} }
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc)?;
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc)?;
}
index_writer.commit()?;
Ok(()) Ok(())
} }
@@ -407,12 +407,12 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"))?; index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"c"))?; index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -434,7 +434,7 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.commit()?;
let index_reader = index.reader()?; let index_reader = index.reader()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
@@ -456,9 +456,9 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!())?; index_writer.add_document(doc!());
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -500,20 +500,20 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.add_document(doc!(text_field=>" a c"))?; index_writer.add_document(doc!(text_field=>" a c"));
// 2 // 2
index_writer.add_document(doc!(text_field=>" b c"))?; index_writer.add_document(doc!(text_field=>" b c"));
// 3 // 3
index_writer.add_document(doc!(text_field=>" b d"))?; index_writer.add_document(doc!(text_field=>" b d"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
// 4 // 4
index_writer.add_document(doc!(text_field=>" b c"))?; index_writer.add_document(doc!(text_field=>" b c"));
// 5 // 5
index_writer.add_document(doc!(text_field=>" a"))?; index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit()?; index_writer.commit()?;
} }
{ {
@@ -547,7 +547,7 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback()?;
@@ -583,7 +583,7 @@ pub mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
@@ -633,7 +633,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?; index_writer.add_document(doc!(field=>1u64));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -657,7 +657,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?; index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -681,7 +681,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI; let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?; index_writer.add_document(doc!(value_field => val));
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -704,7 +704,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -727,14 +727,14 @@ pub mod tests {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?; index_writer.add_document(doc!(text_field=>"63"));
index_writer.add_document(doc!(text_field=>"70"))?; index_writer.add_document(doc!(text_field=>"70"));
index_writer.add_document(doc!(text_field=>"34"))?; index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc!(text_field=>"1"))?; index_writer.add_document(doc!(text_field=>"1"));
index_writer.add_document(doc!(text_field=>"38"))?; index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"))?; index_writer.add_document(doc!(text_field=>"33"));
index_writer.add_document(doc!(text_field=>"40"))?; index_writer.add_document(doc!(text_field=>"40"));
index_writer.add_document(doc!(text_field=>"17"))?; index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(Term::from_field_text(text_field, "38")); index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34")); index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?; index_writer.commit()?;
@@ -752,7 +752,7 @@ pub mod tests {
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?; index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.commit()?; index_writer.commit()?;
} }
{ {
@@ -784,9 +784,9 @@ pub mod tests {
let reader = index.reader()?; let reader = index.reader()?;
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?; index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
@@ -848,9 +848,9 @@ pub mod tests {
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?; index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?; index_writer.commit()?;
reader.reload()?; reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64); assert_eq!(reader.searcher().num_docs(), 3u64);
@@ -890,7 +890,7 @@ pub mod tests {
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document)?; index_writer.add_document(document);
index_writer.commit()?; index_writer.commit()?;
} }
let reader = index.reader()?; let reader = index.reader()?;
@@ -957,7 +957,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id))?; index_writer.add_document(doc!(id => doc_id));
} }
index_writer.commit()?; index_writer.commit()?;
@@ -974,7 +974,7 @@ pub mod tests {
index_writer.delete_term(Term::from_field_u64(id, doc_id)); index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
index_writer.add_document(doc!(id => doc_id))?; index_writer.add_document(doc!(id => doc_id));
index_writer.commit()?; index_writer.commit()?;
index_reader.reload()?; index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
@@ -1009,8 +1009,8 @@ pub mod tests {
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?; let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 { for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?; writer.add_document(doc!(body => "foo"));
writer.add_document(doc!(body => "boo"))?; writer.add_document(doc!(body => "boo"));
} }
writer.commit()?; writer.commit()?;
assert!(index.validate_checksum()?.is_empty()); assert!(index.validate_checksum()?.is_empty());


@@ -1,5 +1,14 @@
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
unsafe fn binary_search_step(ptr: *const u32, target: u32, half_size: isize) -> *const u32 {
let mid = ptr.offset(half_size);
if *mid < target {
mid.offset(1)
} else {
ptr
}
}
/// Search the first index containing an element greater or equal to /// Search the first index containing an element greater or equal to
/// the target. /// the target.
/// ///
@@ -21,16 +30,18 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
/// end of the last block for instance. /// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block. /// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize { pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0; let start_ptr: *const u32 = &arr[0] as *const u32;
let mut len = arr.len(); unsafe {
for _ in 0..7 { let mut ptr = start_ptr;
len /= 2; ptr = binary_search_step(ptr, target, 63);
let pivot = unsafe { *arr.get_unchecked(start + len - 1) }; ptr = binary_search_step(ptr, target, 31);
if pivot < target { ptr = binary_search_step(ptr, target, 15);
start += len; ptr = binary_search_step(ptr, target, 7);
} ptr = binary_search_step(ptr, target, 3);
ptr = binary_search_step(ptr, target, 1);
let extra = if *ptr < target { 1 } else { 0 };
(ptr.offset_from(start_ptr) as usize) + extra
} }
start
} }
#[cfg(test)] #[cfg(test)]
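The hunk above swaps the loop-based search for an unrolled, pointer-stepping variant; both compute the first index whose element is greater than or equal to the target. As a hedged sketch (not part of the diff), the contract can be checked against a plain linear scan, assuming COMPRESSION_BLOCK_SIZE is 128 as the halving steps imply; the helper and test names below are hypothetical.

    // Reference implementation: index of the first element >= target.
    fn first_index_greater_or_equal(block: &[u32], target: u32) -> usize {
        block.iter().take_while(|&&el| el < target).count()
    }

    #[test]
    fn branchless_binary_search_matches_linear_scan() {
        let block: [u32; 128] = std::array::from_fn(|i| (i as u32) * 3);
        // Targets stay <= the last element (381), per the contract above.
        for target in [0u32, 1, 3, 200, 381] {
            assert_eq!(
                branchless_binary_search(&block, target),
                first_index_greater_or_equal(&block, target)
            );
        }
    }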

View File

@@ -393,8 +393,8 @@ mod tests {
} }
#[test] #[test]
fn test_block_segment_postings() -> crate::Result<()> { fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?; let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32; let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct // checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000); assert_eq!(block_segments.doc_freq(), 100_000);
@@ -409,17 +409,16 @@ mod tests {
offset += block.len() as u32; offset += block.len() as u32;
block_segments.advance(); block_segments.advance();
} }
Ok(())
} }
#[test] #[test]
fn test_skip_right_at_new_block() -> crate::Result<()> { fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>(); let mut doc_ids = (0..128).collect::<Vec<u32>>();
// 128 is missing // 128 is missing
doc_ids.push(129); doc_ids.push(129);
doc_ids.push(130); doc_ids.push(130);
{ {
let block_segments = build_block_postings(&doc_ids)?; let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129); assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129); assert_eq!(docset.doc(), 129);
@@ -428,7 +427,7 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED); assert_eq!(docset.advance(), TERMINATED);
} }
{ {
let block_segments = build_block_postings(&doc_ids).unwrap(); let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129); assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129); assert_eq!(docset.doc(), 129);
@@ -437,47 +436,46 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED); assert_eq!(docset.advance(), TERMINATED);
} }
{ {
let block_segments = build_block_postings(&doc_ids)?; let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None); let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0); assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED); assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED); assert_eq!(docset.doc(), TERMINATED);
} }
Ok(())
} }
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> { fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for &doc in docs { for &doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64))?; index_writer.add_document(doc!(int_field=>1u64));
} }
index_writer.add_document(doc!(int_field=>0u64))?; index_writer.add_document(doc!(int_field=>0u64));
last_doc = doc + 1; last_doc = doc + 1;
} }
index_writer.commit()?; index_writer.commit().unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64); let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term)?.unwrap(); let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
let block_postings = inverted_index inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
Ok(block_postings) .unwrap()
} }
#[test] #[test]
fn test_block_segment_postings_seek() -> crate::Result<()> { fn test_block_segment_postings_seek() {
let mut docs = vec![0]; let mut docs = vec![0];
for i in 0..1300 { for i in 0..1300 {
docs.push((i * i / 100) + i); docs.push((i * i / 100) + i);
} }
let mut block_postings = build_block_postings(&docs[..])?; let mut block_postings = build_block_postings(&docs[..]);
for i in &[0, 424, 10000] { for i in &[0, 424, 10000] {
block_postings.seek(*i); block_postings.seek(*i);
let docs = block_postings.docs(); let docs = block_postings.docs();
@@ -486,7 +484,6 @@ mod tests {
} }
block_postings.seek(100_000); block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
Ok(())
} }
#[test] #[test]
@@ -500,7 +497,7 @@ mod tests {
// the other containing odd numbers. // the other containing odd numbers.
for i in 0..6 { for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64); let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();

View File

@@ -47,6 +47,7 @@ pub mod tests {
use crate::fieldnorm::FieldNormReader; use crate::fieldnorm::FieldNormReader;
use crate::indexer::operation::AddOperation; use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter; use crate::indexer::SegmentWriter;
use crate::merge_policy::NoMergePolicy;
use crate::query::Scorer; use crate::query::Scorer;
use crate::schema::{Field, TextOptions}; use crate::schema::{Field, TextOptions};
use crate::schema::{IndexRecordOption, TextFieldIndexing}; use crate::schema::{IndexRecordOption, TextFieldIndexing};
@@ -86,12 +87,12 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#))?; index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?; index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 { for _ in 0..1_000 {
index_writer.add_document(doc!(title => r#"abc abc abc"#))?; index_writer.add_document(doc!(title => r#"abc abc abc"#));
} }
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?; index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
@@ -152,68 +153,50 @@ pub mod tests {
Ok(()) Ok(())
} }
#[test]
pub fn test_index_max_length_token() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
Ok(())
}
#[test] #[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> { pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1); let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello"); exceeding_token_text.push_str(" hello");
index_writer.add_document(doc!(text_field=>exceeding_token_text))?; let mut schema_builder = Schema::builder();
index_writer.commit()?; let text_options = TextOptions::default().set_indexing_options(
reader.reload()?; TextFieldIndexing::default()
let searcher = reader.searcher(); .set_index_option(IndexRecordOption::WithFreqsAndPositions)
let segment_reader = searcher.segment_reader(0u32); .set_tokenizer("simple_no_truncation"),
let inverted_index = segment_reader.inverted_index(text_field)?; );
assert_eq!(inverted_index.terms().num_terms(), 1); let text_field = schema_builder.add_text_field("text", text_options);
let mut bytes = vec![]; let schema = schema_builder.build();
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); let index = Index::create_in_ram(schema);
assert_eq!(&bytes, b"hello"); index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes, b"hello");
}
{
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
Ok(()) Ok(())
} }
@@ -332,13 +315,13 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c"))?; index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"))?; index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader let mut postings = segment_reader
.inverted_index(text_field)? .inverted_index(text_field)?
@@ -367,7 +350,7 @@ pub mod tests {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
for i in 0u64..num_docs as u64 { for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64); let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc)?; index_writer.add_document(doc);
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -617,7 +600,7 @@ mod bench {
doc.add_text(text_field, "c"); doc.add_text(text_field, "c");
} }
doc.add_text(text_field, "d"); doc.add_text(text_field, "d");
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc);
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }

View File

@@ -5,8 +5,8 @@ use crate::postings::recorder::{
}; };
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;
use crate::postings::{FieldSerializer, InvertedIndexSerializer}; use crate::postings::{FieldSerializer, InvertedIndexSerializer};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term}; use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
use crate::schema::{IndexRecordOption, Type};
use crate::termdict::TermOrdinal; use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream; use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN}; use crate::tokenizer::{Token, MAX_TOKEN_LEN};
@@ -33,13 +33,15 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed() SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
} }
}) })
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed), .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) FieldType::U64(_)
| FieldType::I64(_) | FieldType::I64(_)
| FieldType::F64(_) | FieldType::F64(_)
| FieldType::Date(_) | FieldType::Date(_)
| FieldType::Bytes(_) | FieldType::Bytes(_)
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(), | FieldType::HierarchicalFacet(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
} }
} }
@@ -51,11 +53,11 @@ pub struct MultiFieldPostingsWriter {
} }
fn make_field_partition( fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> { ) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets let term_offsets_it = term_offsets
.iter() .iter()
.map(|(term, _, _)| term.field()) .map(|(key, _, _)| Term::wrap(key).field())
.enumerate(); .enumerate();
let mut prev_field_opt = None; let mut prev_field_opt = None;
let mut fields = vec![]; let mut fields = vec![];
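make_field_partition, shown partially in the hunk above, walks term offsets that are already sorted by field and emits one contiguous index range per field. A hedged, self-contained sketch of that grouping idea, using a plain u32 key in place of the real Field type (all names here are hypothetical):

    use std::ops::Range;

    // Group a slice that is already sorted by key into (key, range) pairs,
    // mirroring what make_field_partition produces for the sorted term offsets.
    fn partition_sorted_by_key(keys: &[u32]) -> Vec<(u32, Range<usize>)> {
        let mut partitions = Vec::new();
        let mut start = 0usize;
        for end in 1..=keys.len() {
            if end == keys.len() || keys[end] != keys[start] {
                partitions.push((keys[start], start..end));
                start = end;
            }
        }
        partitions
    }

    // partition_sorted_by_key(&[1, 1, 2, 2, 2, 5])
    //     == vec![(1, 0..2), (2, 2..5), (5, 5..6)]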
@@ -130,10 +132,10 @@ impl MultiFieldPostingsWriter {
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> { ) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> = let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
Vec::with_capacity(self.term_index.len()); Vec::with_capacity(self.term_index.len());
term_offsets.extend(self.term_index.iter()); term_offsets.extend(self.term_index.iter());
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone()); term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> = let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new(); HashMap::new();
@@ -144,7 +146,7 @@ impl MultiFieldPostingsWriter {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() { match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => { FieldType::Str(_) | FieldType::HierarchicalFacet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping // populating the (unordered term ord) -> (ordered term ord) mapping
// for the field. // for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()] let unordered_term_ids = term_offsets[byte_offsets.clone()]
@@ -208,7 +210,7 @@ pub trait PostingsWriter {
/// The actual serialization format is handled by the `PostingsSerializer`. /// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize( fn serialize(
&self, &self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>, serializer: &mut FieldSerializer<'_>,
term_heap: &MemoryArena, term_heap: &MemoryArena,
heap: &MemoryArena, heap: &MemoryArena,
@@ -225,7 +227,7 @@ pub trait PostingsWriter {
heap: &mut MemoryArena, heap: &mut MemoryArena,
term_buffer: &mut Term, term_buffer: &mut Term,
) -> u32 { ) -> u32 {
term_buffer.set_field(Type::Str, field); term_buffer.set_field(field);
let mut sink = |token: &Token| { let mut sink = |token: &Token| {
// We skip all tokens with a len that does not fit in a u16. // We skip all tokens with a len that does not fit in a u16.
if token.text.len() <= MAX_TOKEN_LEN { if token.text.len() <= MAX_TOKEN_LEN {
@@ -279,7 +281,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
) -> UnorderedTermId { ) -> UnorderedTermId {
debug_assert!(term.as_slice().len() >= 4); debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1; self.total_num_tokens += 1;
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| { term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder { if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc(); let current_doc = recorder.current_doc();
if current_doc != doc { if current_doc != doc {
@@ -299,17 +301,17 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
fn serialize( fn serialize(
&self, &self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>, serializer: &mut FieldSerializer<'_>,
termdict_heap: &MemoryArena, termdict_heap: &MemoryArena,
heap: &MemoryArena, heap: &MemoryArena,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
let mut buffer_lender = BufferLender::default(); let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs { for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = termdict_heap.read(*addr); let recorder: Rec = termdict_heap.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?; serializer.new_term(&term_bytes[4..], term_doc_freq)?;
recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map); recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map);
serializer.close_term()?; serializer.close_term()?;
} }

View File

@@ -13,7 +13,6 @@ use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score}; use crate::{DocId, Score};
use common::CountingWriter; use common::CountingWriter;
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
use fail::fail_point;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::io::{self, Write}; use std::io::{self, Write};
@@ -213,9 +212,6 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded /// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding. /// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> { pub fn close_term(&mut self) -> io::Result<()> {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
});
if self.term_open { if self.term_open {
self.postings_serializer self.postings_serializer
.close_term(self.current_term_info.doc_freq)?; .close_term(self.current_term_info.doc_freq)?;
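The doc comment above notes that an incomplete block is written with VInt encoding. As background, here is a hedged sketch of a generic LEB128-style varint encoder; it only illustrates the idea of seven payload bits per byte and is not claimed to match the exact byte layout of tantivy's VInt.

    // Illustrative varint encoder: the low 7 bits of each byte carry payload
    // and the high bit is set while more bytes follow, so small values fit in
    // a single byte.
    fn vint_encode(mut value: u32, out: &mut Vec<u8>) {
        loop {
            let byte = (value & 0x7F) as u8;
            value >>= 7;
            if value == 0 {
                out.push(byte);
                return;
            }
            out.push(byte | 0x80);
        }
    }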
@@ -308,8 +304,10 @@ pub struct PostingsSerializer<W: Write> {
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>, bm25_weight: Option<Bm25Weight>,
num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of terms in the field for that segment. avg_fieldnorm: Score, // Average number of terms in the field for that segment.
// this value is used to compute the block wand information. // this value is used to compute the block wand information.
} }
impl<W: Write> PostingsSerializer<W> { impl<W: Write> PostingsSerializer<W> {
@@ -319,6 +317,10 @@ impl<W: Write> PostingsSerializer<W> {
mode: IndexRecordOption, mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> { ) -> PostingsSerializer<W> {
let num_docs = fieldnorm_reader
.as_ref()
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
.unwrap_or(0u32);
PostingsSerializer { PostingsSerializer {
output_write: CountingWriter::wrap(write), output_write: CountingWriter::wrap(write),
@@ -333,33 +335,21 @@ impl<W: Write> PostingsSerializer<W> {
fieldnorm_reader, fieldnorm_reader,
bm25_weight: None, bm25_weight: None,
num_docs,
avg_fieldnorm, avg_fieldnorm,
} }
} }
pub fn new_term(&mut self, term_doc_freq: u32) { pub fn new_term(&mut self, term_doc_freq: u32) {
self.bm25_weight = None; if self.mode.has_freq() && self.num_docs > 0 {
let bm25_weight = Bm25Weight::for_one_term(
if !self.mode.has_freq() { term_doc_freq as u64,
return; self.num_docs as u64,
self.avg_fieldnorm,
);
self.bm25_weight = Some(bm25_weight);
} }
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
} }
fn write_block(&mut self) { fn write_block(&mut self) {
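Both sides of the hunk above build a per-term Bm25Weight from the term's document frequency, the number of documents in the segment, and the average fieldnorm; the refactor only changes where the document count comes from. For context, a hedged sketch of the idf component of the usual BM25 formulation (the standard Lucene-style formula is assumed here, not read from this diff):

    // idf(term) = ln(1 + (N - df + 0.5) / (df + 0.5)), where N is the number
    // of documents in the segment and df the number of documents containing
    // the term.
    fn bm25_idf(doc_freq: u64, num_docs: u64) -> f32 {
        let doc_freq = doc_freq as f32;
        let num_docs = num_docs as f32;
        (1.0 + (num_docs - doc_freq + 0.5) / (doc_freq + 0.5)).ln()
    }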

View File

@@ -186,6 +186,7 @@ mod tests {
use super::*; use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test] #[test]
fn test_stack() { fn test_stack() {
let mut heap = MemoryArena::new(); let mut heap = MemoryArena::new();

View File

@@ -3,7 +3,6 @@ use murmurhash32::murmurhash2;
use super::{Addr, MemoryArena}; use super::{Addr, MemoryArena};
use crate::postings::stacker::memory_arena::store; use crate::postings::stacker::memory_arena::store;
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;
use crate::Term;
use byteorder::{ByteOrder, NativeEndian}; use byteorder::{ByteOrder, NativeEndian};
use std::iter; use std::iter;
use std::mem; use std::mem;
@@ -82,13 +81,13 @@ pub struct Iter<'a> {
} }
impl<'a> Iterator for Iter<'a> { impl<'a> Iterator for Iter<'a> {
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId); type Item = (&'a [u8], Addr, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| { self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket]; let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr); let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(Term::wrap(key), offset, kv.unordered_term_id) (key, offset, kv.unordered_term_id)
}) })
} }
} }
@@ -190,19 +189,21 @@ impl TermHashMap {
/// will be in charge of returning a default value. /// will be in charge of returning a default value.
/// If the key already has an associated value, then it will be passed /// If the key already has an associated value, then it will be passed
/// `Some(previous_value)`. /// `Some(previous_value)`.
pub fn mutate_or_create<V, TMutator>( pub fn mutate_or_create<S, V, TMutator>(
&mut self, &mut self,
key: &[u8], key: S,
mut updater: TMutator, mut updater: TMutator,
) -> UnorderedTermId ) -> UnorderedTermId
where where
S: AsRef<[u8]>,
V: Copy + 'static, V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V, TMutator: FnMut(Option<V>) -> V,
{ {
if self.is_saturated() { if self.is_saturated() {
self.resize(); self.resize();
} }
let hash = murmurhash2(key); let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash); let mut probe = self.probe(hash);
loop { loop {
let bucket = probe.next_probe(); let bucket = probe.next_probe();
@@ -210,18 +211,21 @@ impl TermHashMap {
if kv.is_empty() { if kv.is_empty() {
// The key does not exist yet. // The key does not exist yet.
let val = updater(None); let val = updater(None);
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>(); let num_bytes =
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
let key_addr = self.heap.allocate_space(num_bytes); let key_addr = self.heap.allocate_space(num_bytes);
{ {
let data = self.heap.slice_mut(key_addr, num_bytes); let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key.len() as u16); NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key.len(); let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key); data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val); store(&mut data[stop..], val);
} }
return self.set_bucket(hash, key_addr, bucket); return self.set_bucket(hash, key_addr, bucket);
} else if kv.hash == hash { } else if kv.hash == hash {
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) { if let Some(val_addr) =
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
{
let v = self.heap.read(val_addr); let v = self.heap.read(val_addr);
let new_v = updater(Some(v)); let new_v = updater(Some(v));
self.heap.write_at(val_addr, new_v); self.heap.write_at(val_addr, new_v);
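With the signature above now generic over S: AsRef<[u8]>, callers can pass either byte slices or string slices. A hedged usage sketch of the updater contract from the doc comment; the counter semantics are hypothetical and only show the None-on-first-insert, Some(previous)-afterwards behaviour.

    #[test]
    fn mutate_or_create_updater_contract() {
        let mut map: TermHashMap = TermHashMap::new(10);
        // First insertion: the updater sees None and supplies the initial value.
        map.mutate_or_create("tantivy", |prev: Option<u32>| prev.map_or(1u32, |count| count + 1));
        // Same key again, now as a byte slice: the updater sees Some(1) and
        // stores 2.
        map.mutate_or_create(&b"tantivy"[..], |prev: Option<u32>| prev.map_or(1u32, |count| count + 1));
    }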
@@ -241,18 +245,25 @@ mod tests {
#[test] #[test]
fn test_hash_map() { fn test_hash_map() {
let mut hash_map: TermHashMap = TermHashMap::new(18); let mut hash_map: TermHashMap = TermHashMap::new(18);
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| { {
assert_eq!(opt_val, None); hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
3u32 assert_eq!(opt_val, None);
}); 3u32
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| { });
assert_eq!(opt_val, None); }
4u32 {
}); hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| { assert_eq!(opt_val, None);
assert_eq!(opt_val, Some(3u32)); 4u32
5u32 });
}); }
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(3u32));
5u32
});
}
let mut vanilla_hash_map = HashMap::new(); let mut vanilla_hash_map = HashMap::new();
let iter_values = hash_map.iter(); let iter_values = hash_map.iter();
for (key, addr, _) in iter_values { for (key, addr, _) in iter_values {

View File

@@ -78,29 +78,29 @@ mod tests {
use crate::schema::{Schema, TEXT}; use crate::schema::{Schema, TEXT};
use crate::Index; use crate::Index;
fn create_test_index() -> crate::Result<Index> { fn create_test_index() -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_text_field("text", TEXT); let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa"))?; index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"))?; index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit()?; index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"))?; index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit()?; index_writer.commit().unwrap();
Ok(index) index
} }
#[test] #[test]
fn test_all_query() -> crate::Result<()> { fn test_all_query() {
let index = create_test_index()?; let index = create_test_index();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false)?; let weight = AllQuery.weight(&searcher, false).unwrap();
{ {
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0)?; let mut scorer = weight.scorer(reader, 1.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), 1u32); assert_eq!(scorer.advance(), 1u32);
assert_eq!(scorer.doc(), 1u32); assert_eq!(scorer.doc(), 1u32);
@@ -108,30 +108,28 @@ mod tests {
} }
{ {
let reader = searcher.segment_reader(1); let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader, 1.0)?; let mut scorer = weight.scorer(reader, 1.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), TERMINATED); assert_eq!(scorer.advance(), TERMINATED);
} }
Ok(())
} }
#[test] #[test]
fn test_all_query_with_boost() -> crate::Result<()> { fn test_all_query_with_boost() {
let index = create_test_index()?; let index = create_test_index();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false)?; let weight = AllQuery.weight(&searcher, false).unwrap();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
{ {
let mut scorer = weight.scorer(reader, 2.0)?; let mut scorer = weight.scorer(reader, 2.0).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 2.0); assert_eq!(scorer.score(), 2.0);
} }
{ {
let mut scorer = weight.scorer(reader, 1.5)?; let mut scorer = weight.scorer(reader, 1.5).unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.5); assert_eq!(scorer.score(), 1.5);
} }
Ok(())
} }
} }

View File

@@ -92,16 +92,16 @@ mod tests {
use crate::Index; use crate::Index;
use tantivy_fst::Automaton; use tantivy_fst::Automaton;
fn create_index() -> crate::Result<Index> { fn create_index() -> Index {
let mut schema = Schema::builder(); let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING); let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build()); let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc"))?; index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd"))?; index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd"))?; index_writer.add_document(doc!(title=>"abcd"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
Ok(index) index
} }
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
@@ -140,32 +140,34 @@ mod tests {
} }
#[test] #[test]
fn test_automaton_weight() -> crate::Result<()> { fn test_automaton_weight() {
let index = create_index()?; let index = create_index();
let field = index.schema().get_field("title").unwrap(); let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA); let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.0)?; let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.0); assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), 2u32); assert_eq!(scorer.advance(), 2u32);
assert_eq!(scorer.doc(), 2u32); assert_eq!(scorer.doc(), 2u32);
assert_eq!(scorer.score(), 1.0); assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), TERMINATED); assert_eq!(scorer.advance(), TERMINATED);
Ok(())
} }
#[test] #[test]
fn test_automaton_weight_boost() -> crate::Result<()> { fn test_automaton_weight_boost() {
let index = create_index()?; let index = create_index();
let field = index.schema().get_field("title").unwrap(); let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA); let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.32)?; let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.32)
.unwrap();
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.32); assert_eq!(scorer.score(), 1.32);
Ok(())
} }
} }

View File

@@ -42,39 +42,27 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc)) Some((before_pivot_len, pivot_len, pivot_doc))
} }
/// Advance the scorer with best score among the scorers[..pivot_len] to // Before and after calling this method, scorers need to be sorted by their `.doc()`.
/// the next doc candidate defined by the min of `last_doc_in_block + 1` for
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer( fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>, scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize, pivot_len: usize,
) { ) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1; let mut scorer_to_seek = pivot_len - 1;
let mut global_max_score = scorers[scorer_to_seek].max_score; let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
for scorer_ord in (0..pivot_len - 1).rev() { for scorer_ord in (0..pivot_len - 1).rev() {
let scorer = &scorers[scorer_ord]; let scorer = &scorers[scorer_ord];
if scorer.last_doc_in_block() <= doc_to_seek_after { if scorer.last_doc_in_block() <= doc_to_seek_after {
doc_to_seek_after = scorer.last_doc_in_block(); doc_to_seek_after = scorer.last_doc_in_block();
}
if scorers[scorer_ord].max_score > global_max_score {
global_max_score = scorers[scorer_ord].max_score;
scorer_to_seek = scorer_ord; scorer_to_seek = scorer_ord;
} }
} }
// Add +1 to go to the next block unless we are already at the end.
if doc_to_seek_after != TERMINATED {
doc_to_seek_after += 1;
}
for scorer in &scorers[pivot_len..] { for scorer in &scorers[pivot_len..] {
if scorer.doc() <= doc_to_seek_after { if scorer.doc() <= doc_to_seek_after {
doc_to_seek_after = scorer.doc(); doc_to_seek_after = scorer.doc();
} }
} }
scorers[scorer_to_seek].seek(doc_to_seek_after); scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek); restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
} }
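The debug assertions above guard the precondition that scorers stay sorted by their current doc id before and after this routine. A hedged sketch of what such an is_sorted check over doc ids could look like; the real helper is defined elsewhere in this module and may differ.

    // True when the doc ids come out in non-decreasing order.
    fn is_sorted<I: Iterator<Item = u32>>(docs: I) -> bool {
        let mut prev = 0u32;
        for doc in docs {
            if doc < prev {
                return false;
            }
            prev = doc;
        }
        true
    }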
@@ -142,9 +130,6 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>,
term_scorers.sort_by_key(|scorer| scorer.doc()); term_scorers.sort_by_key(|scorer| scorer.doc());
} }
/// Implements the WAND (Weak AND) algorithm for dynamic pruning
/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
/// Link: http://engineering.nyu.edu/~suel/papers/bmw.pdf
pub fn block_wand( pub fn block_wand(
mut scorers: Vec<TermScorer>, mut scorers: Vec<TermScorer>,
mut threshold: Score, mut threshold: Score,
@@ -202,7 +187,6 @@ pub fn block_wand(
.iter_mut() .iter_mut()
.map(|scorer| scorer.score()) .map(|scorer| scorer.score())
.sum(); .sum();
if score > threshold { if score > threshold {
threshold = callback(pivot_doc, score); threshold = callback(pivot_doc, score);
} }
@@ -211,56 +195,6 @@ pub fn block_wand(
} }
} }
/// Specialized version of [`block_wand`] for a single scorer.
/// In this case, the algorithm is simpler, more readable, and faster (~3x)
/// than the generic algorithm.
/// The algorithm behaves as follows:
/// - While we don't hit the end of the docset:
/// - While the block max score is under the `threshold`, go to the
/// next block.
/// - On a block, advance until the end and execute `callback`
/// when the doc score is greater or equal to the `threshold`.
pub fn block_wand_single_scorer(
mut scorer: TermScorer,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut doc = scorer.doc();
loop {
// We position the scorer on a block that can reach
// the threshold.
while scorer.block_max_score() < threshold {
let last_doc_in_block = scorer.last_doc_in_block();
if last_doc_in_block == TERMINATED {
return;
}
doc = last_doc_in_block + 1;
scorer.shallow_seek(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
if doc == TERMINATED {
break;
}
loop {
let score = scorer.score();
if score > threshold {
threshold = callback(doc, score);
}
debug_assert!(doc <= scorer.last_doc_in_block());
if doc == scorer.last_doc_in_block() {
break;
}
doc = scorer.advance();
if doc == TERMINATED {
return;
}
}
doc += 1;
scorer.shallow_seek(doc);
}
}
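The single-scorer variant removed above hands each qualifying (doc, score) pair to a callback and treats the callback's return value as the new pruning threshold. A hedged sketch of that contract, simplified to tracking only the single best hit so no heap is needed; Score is assumed to be an alias for f32, and the commented-out call is illustrative since building a TermScorer requires an index.

    let mut best: Option<(u32, f32)> = None;
    let mut callback = |doc: u32, score: f32| -> f32 {
        best = Some((doc, score));
        // Returning the score raises the threshold, so only strictly better
        // documents are reported from now on.
        score
    };
    // block_wand_single_scorer(scorer, f32::MIN, &mut callback);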
struct TermScorerWithMaxScore<'a> { struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer, scorer: &'a mut TermScorer,
max_score: Score, max_score: Score,
@@ -338,14 +272,13 @@ mod tests {
} }
fn compute_checkpoints_for_each_pruning( fn compute_checkpoints_for_each_pruning(
mut term_scorers: Vec<TermScorer>, term_scorers: Vec<TermScorer>,
n: usize, n: usize,
) -> Vec<(DocId, Score)> { ) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n); let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new(); let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut limit: Score = 0.0; let mut limit: Score = 0.0;
super::block_wand(term_scorers, Score::MIN, &mut |doc, score| {
let callback = &mut |doc, score| {
heap.push(Float(score)); heap.push(Float(score));
if heap.len() > n { if heap.len() > n {
heap.pop().unwrap(); heap.pop().unwrap();
@@ -357,14 +290,7 @@ mod tests {
checkpoints.push((doc, score)); checkpoints.push((doc, score));
} }
limit limit
}; });
if term_scorers.len() == 1 {
let scorer = term_scorers.pop().unwrap();
super::block_wand_single_scorer(scorer, Score::MIN, callback);
} else {
super::block_wand(term_scorers, Score::MIN, callback);
}
checkpoints checkpoints
} }
@@ -498,14 +424,6 @@ mod tests {
} }
} }
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_block_wand_single_term_scorer((posting_lists, fieldnorms) in gen_term_scorers(1)) {
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
}
}
#[test] #[test]
fn test_fn_reproduce_proptest() { fn test_fn_reproduce_proptest() {
let postings_lists = &[ let postings_lists = &[

View File

@@ -41,22 +41,22 @@ use std::collections::BTreeMap;
/// let mut index_writer = index.writer(3_000_000)?; /// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Name of the Wind", /// title => "The Name of the Wind",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of Muadib", /// title => "The Diary of Muadib",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "A Dairy Cow", /// title => "A Dairy Cow",
/// body => "hidden", /// body => "hidden",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "A Dairy Cow", /// title => "A Dairy Cow",
/// body => "found", /// body => "found",
/// ))?; /// ));
/// index_writer.add_document(doc!( /// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl", /// title => "The Diary of a Young Girl",
/// ))?; /// ));
/// index_writer.commit()?; /// index_writer.commit().unwrap();
/// } /// }
/// ///
/// let reader = index.reader()?; /// let reader = index.reader()?;
@@ -217,11 +217,11 @@ mod tests {
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?; let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(text=>"b c"))?; writer.add_document(doc!(text=>"b c"));
writer.add_document(doc!(text=>"a c"))?; writer.add_document(doc!(text=>"a c"));
writer.add_document(doc!(text=>"a b"))?; writer.add_document(doc!(text=>"a b"));
writer.add_document(doc!(text=>"a d"))?; writer.add_document(doc!(text=>"a d"));
writer.commit()?; writer.commit()?;
Ok(index) Ok(index)
} }

View File

@@ -3,7 +3,6 @@ mod boolean_query;
mod boolean_weight; mod boolean_weight;
pub(crate) use self::block_wand::block_wand; pub(crate) use self::block_wand::block_wand;
pub(crate) use self::block_wand::block_wand_single_scorer;
pub use self::boolean_query::BooleanQuery; pub use self::boolean_query::BooleanQuery;
#[cfg(test)] #[cfg(test)]
@@ -26,75 +25,72 @@ mod tests {
use crate::Index; use crate::Index;
use crate::{DocAddress, DocId, Score}; use crate::{DocAddress, DocId, Score};
fn aux_test_helper() -> crate::Result<(Index, Field)> { fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"))?; index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"))?; index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"))?; index_writer.add_document(doc!(text_field => "b c"));
index_writer.add_document(doc!(text_field => "a b c d"))?; index_writer.add_document(doc!(text_field => "a b c d"));
index_writer.add_document(doc!(text_field => "d"))?; index_writer.add_document(doc!(text_field => "d"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
Ok((index, text_field)) (index, text_field)
} }
#[test] #[test]
pub fn test_boolean_non_all_term_disjunction() -> crate::Result<()> { pub fn test_boolean_non_all_term_disjunction() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("(+a +b) d")?; let query = query_parser.parse_query("(+a +b) d").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
assert_eq!(query.count(&searcher)?, 3); assert_eq!(query.count(&searcher).unwrap(), 3);
Ok(())
} }
#[test] #[test]
pub fn test_boolean_single_must_clause() -> crate::Result<()> { pub fn test_boolean_single_must_clause() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a")?; let query = query_parser.parse_query("+a").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
Ok(())
} }
#[test] #[test]
pub fn test_boolean_termonly_intersection() -> crate::Result<()> { pub fn test_boolean_termonly_intersection() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
{ {
let query = query_parser.parse_query("+a +b +c")?; let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<Intersection<TermScorer>>()); assert!(scorer.is::<Intersection<TermScorer>>());
} }
{ {
let query = query_parser.parse_query("+a +(b c)")?; let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>()); assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_reqopt() -> crate::Result<()> { pub fn test_boolean_reqopt() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]); let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader()?.searcher(); let searcher = index.reader().unwrap().searcher();
{ {
let query = query_parser.parse_query("+a b")?; let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true)?; let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<RequiredOptionalScorer< assert!(scorer.is::<RequiredOptionalScorer<
Box<dyn Scorer>, Box<dyn Scorer>,
Box<dyn Scorer>, Box<dyn Scorer>,
@@ -102,17 +98,16 @@ mod tests {
>>()); >>());
} }
{ {
let query = query_parser.parse_query("+a b")?; let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false)?; let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query() -> crate::Result<()> { pub fn test_boolean_query() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -123,7 +118,7 @@ mod tests {
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let matching_docs = |boolean_query: &dyn Query| { let matching_docs = |boolean_query: &dyn Query| {
reader reader
@@ -170,12 +165,11 @@ mod tests {
let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]); let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]);
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new()); assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query_two_excluded() -> crate::Result<()> { pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -186,7 +180,7 @@ mod tests {
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| { let matching_topdocs = |query: &dyn Query| {
reader reader
@@ -219,21 +213,20 @@ mod tests {
assert_eq!(top_doc, DocAddress::new(0, 4)); assert_eq!(top_doc, DocAddress::new(0, 4));
assert_eq!(top_score, score_doc_4); assert_eq!(top_score, score_doc_4);
} }
Ok(())
} }
#[test] #[test]
pub fn test_boolean_query_with_weight() -> crate::Result<()> { pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"))?; index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"))?; index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"))?; index_writer.add_document(doc!(text_field => "b c"));
index_writer.commit()?; assert!(index_writer.commit().is_ok());
} }
let term_a: Box<dyn Query> = Box::new(TermQuery::new( let term_a: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "a"), Term::from_field_text(text_field, "a"),
@@ -249,21 +242,24 @@ mod tests {
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]); BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let boolean_weight = boolean_query.weight(&searcher, true).unwrap(); let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
{ {
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 1.0)?; let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445); assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
} }
{ {
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 2.0)?; let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689); assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
} }
Ok(())
} }
#[test] #[test]
pub fn test_intersection_score() -> crate::Result<()> { pub fn test_intersection_score() {
let (index, text_field) = aux_test_helper()?; let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| { let make_term_query = |text: &str| {
let term_query = TermQuery::new( let term_query = TermQuery::new(
@@ -273,7 +269,7 @@ mod tests {
let query: Box<dyn Query> = Box::new(term_query); let query: Box<dyn Query> = Box::new(term_query);
query query
}; };
let reader = index.reader()?; let reader = index.reader().unwrap();
let score_docs = |boolean_query: &dyn Query| { let score_docs = |boolean_query: &dyn Query| {
let fruit = reader let fruit = reader
.searcher() .searcher()
@@ -291,7 +287,6 @@ mod tests {
assert_nearly_equals!(scores[0], 0.977973); assert_nearly_equals!(scores[0], 0.977973);
assert_nearly_equals!(scores[1], 0.84699446); assert_nearly_equals!(scores[1], 0.84699446);
} }
Ok(())
} }
#[test] #[test]
@@ -301,8 +296,8 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?;
index_writer.add_document(doc!(text=>"a"))?; index_writer.add_document(doc!(text=>"a"));
index_writer.add_document(doc!(text=>"b"))?; index_writer.add_document(doc!(text=>"b"));
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let term_a: Box<dyn Query> = Box::new(TermQuery::new( let term_a: Box<dyn Query> = Box::new(TermQuery::new(

View File

@@ -141,20 +141,19 @@ mod tests {
use crate::{DocAddress, Document, Index}; use crate::{DocAddress, Document, Index};
#[test] #[test]
fn test_boost_query_explain() -> crate::Result<()> { fn test_boost_query_explain() {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new())?; index_writer.add_document(Document::new());
index_writer.commit()?; assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let query = BoostQuery::new(Box::new(AllQuery), 0.2); let query = BoostQuery::new(Box::new(AllQuery), 0.2);
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap(); let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
assert_eq!( assert_eq!(
explanation.to_pretty_json(), explanation.to_pretty_json(),
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}" "{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}"
); )
Ok(())
} }
} }

View File

@@ -67,17 +67,17 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
 /// let mut index_writer = index.writer(3_000_000)?;
 /// index_writer.add_document(doc!(
 ///     title => "The Name of the Wind",
-/// ))?;
+/// ));
 /// index_writer.add_document(doc!(
 ///     title => "The Diary of Muadib",
-/// ))?;
+/// ));
 /// index_writer.add_document(doc!(
 ///     title => "A Dairy Cow",
-/// ))?;
+/// ));
 /// index_writer.add_document(doc!(
 ///     title => "The Diary of a Young Girl",
-/// ))?;
+/// ));
-/// index_writer.commit()?;
+/// index_writer.commit().unwrap();
 /// }
 /// let reader = index.reader()?;
 /// let searcher = reader.searcher();
@@ -129,18 +129,13 @@ impl FuzzyTermQuery {
     fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
         // LEV_BUILDER is a HashMap, whose `get` method returns an Option
-        match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) {
+        match LEV_BUILDER.get(&(self.distance, false)) {
             // Unwrap the option and build the Ok(AutomatonWeight)
             Some(automaton_builder) => {
-                let term_text = self.term.as_str().ok_or_else(|| {
-                    crate::TantivyError::InvalidArgument(
-                        "The fuzzy term query requires a string term.".to_string(),
-                    )
-                })?;
                 let automaton = if self.prefix {
-                    automaton_builder.build_prefix_dfa(term_text)
+                    automaton_builder.build_prefix_dfa(self.term.text())
                 } else {
-                    automaton_builder.build_dfa(term_text)
+                    automaton_builder.build_dfa(self.term.text())
                 };
                 Ok(AutomatonWeight::new(
                     self.term.field(),
@@ -169,7 +164,6 @@ impl Query for FuzzyTermQuery {
 mod test {
     use super::FuzzyTermQuery;
     use crate::assert_nearly_equals;
-    use crate::collector::Count;
     use crate::collector::TopDocs;
     use crate::schema::Schema;
     use crate::schema::TEXT;
@@ -177,29 +171,32 @@ mod test {
     use crate::Term;
     #[test]
-    pub fn test_fuzzy_term() -> crate::Result<()> {
+    pub fn test_fuzzy_term() {
         let mut schema_builder = Schema::builder();
         let country_field = schema_builder.add_text_field("country", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(
                 country_field => "japan",
-            ))?;
+            ));
             index_writer.add_document(doc!(
                 country_field => "korea",
-            ))?;
+            ));
-            index_writer.commit()?;
+            index_writer.commit().unwrap();
         }
-        let reader = index.reader()?;
+        let reader = index.reader().unwrap();
         let searcher = reader.searcher();
         // passes because Levenshtein distance is 1 (substitute 'o' with 'a')
         {
             let term = Term::from_field_text(country_field, "japon");
             let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
-            let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
+            let top_docs = searcher
+                .search(&fuzzy_query, &TopDocs::with_limit(2))
+                .unwrap();
             assert_eq!(top_docs.len(), 1, "Expected only 1 document");
             let (score, _) = top_docs[0];
             assert_nearly_equals!(1.0, score);
@@ -210,44 +207,23 @@ mod test {
             let term = Term::from_field_text(country_field, "jap");
             let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
-            let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
+            let top_docs = searcher
+                .search(&fuzzy_query, &TopDocs::with_limit(2))
+                .unwrap();
             assert_eq!(top_docs.len(), 0, "Expected no document");
         }
         // passes because prefix Levenshtein distance is 0
         {
             let term = Term::from_field_text(country_field, "jap");
             let fuzzy_query = FuzzyTermQuery::new_prefix(term, 1, true);
-            let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
+            let top_docs = searcher
+                .search(&fuzzy_query, &TopDocs::with_limit(2))
+                .unwrap();
             assert_eq!(top_docs.len(), 1, "Expected only 1 document");
             let (score, _) = top_docs[0];
             assert_nearly_equals!(1.0, score);
         }
-        Ok(())
-    }
-    #[test]
-    pub fn test_fuzzy_term_transposition_cost_one() -> crate::Result<()> {
-        let mut schema_builder = Schema::builder();
-        let country_field = schema_builder.add_text_field("country", TEXT);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(country_field => "japan"))?;
-        index_writer.commit()?;
-        let reader = index.reader()?;
-        let searcher = reader.searcher();
-        let term_jaapn = Term::from_field_text(country_field, "jaapn");
-        {
-            let fuzzy_query_transposition = FuzzyTermQuery::new(term_jaapn.clone(), 1, true);
-            let count = searcher.search(&fuzzy_query_transposition, &Count)?;
-            assert_eq!(count, 1);
-        }
-        {
-            let fuzzy_query_transposition = FuzzyTermQuery::new(term_jaapn, 1, false);
-            let count = searcher.search(&fuzzy_query_transposition, &Count)?;
-            assert_eq!(count, 0);
-        }
-        Ok(())
     }
 }
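Aside on the change above: the `transposition_cost_one` argument that the right-hand side hard-codes to `false`, and the `test_fuzzy_term_transposition_cost_one` test it removes, control whether swapping two adjacent characters counts as one edit or two. Below is a minimal standalone sketch of that distinction (illustrative only; `edit_distance` is a hypothetical helper, not tantivy code), reproducing why "japan" is within distance 1 of "jaapn" only when transpositions cost one.

// Illustrative sketch, not tantivy code: edit distance with and without
// counting an adjacent-character swap (transposition) as a single edit.
fn edit_distance(a: &str, b: &str, transposition_cost_one: bool) -> usize {
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    // dp[i][j] = distance between the first i chars of `a` and the first j chars of `b`.
    let mut dp = vec![vec![0usize; b.len() + 1]; a.len() + 1];
    for i in 0..=a.len() {
        dp[i][0] = i;
    }
    for j in 0..=b.len() {
        dp[0][j] = j;
    }
    for i in 1..=a.len() {
        for j in 1..=b.len() {
            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
            dp[i][j] = (dp[i - 1][j] + 1) // deletion
                .min(dp[i][j - 1] + 1) // insertion
                .min(dp[i - 1][j - 1] + cost); // substitution
            // Damerau/OSA extension: a swap of two adjacent characters costs 1.
            if transposition_cost_one
                && i > 1
                && j > 1
                && a[i - 1] == b[j - 2]
                && a[i - 2] == b[j - 1]
            {
                dp[i][j] = dp[i][j].min(dp[i - 2][j - 2] + 1);
            }
        }
    }
    dp[a.len()][b.len()]
}

fn main() {
    // "japan" -> "jaapn" is a single swap of 'p' and 'a'.
    assert_eq!(edit_distance("japan", "jaapn", true), 1); // matched by the removed test
    assert_eq!(edit_distance("japan", "jaapn", false), 2); // beyond a max distance of 1
}

With transpositions counted as one edit the distance is 1, otherwise two substitutions are needed, which is why a fuzzy query with a maximum distance of 1 matches in the first case and not the second.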


@@ -0,0 +1,70 @@
+use docset::DocSet;
+use query::Scorer;
+use DocId;
+use Score;
+use SkipResult;
+/// Creates a `DocSet` that iterate through the intersection of two `DocSet`s.
+pub struct IntersectionTwoTerms<TDocSet> {
+    left: TDocSet,
+    right: TDocSet
+}
+impl<TDocSet: DocSet> IntersectionTwoTerms<TDocSet> {
+    pub fn new(left: TDocSet, right: TDocSet) -> IntersectionTwoTerms<TDocSet> {
+        IntersectionTwoTerms {
+            left,
+            right
+        }
+    }
+}
+impl<TDocSet: DocSet> DocSet for IntersectionTwoTerms<TDocSet> {
+    fn advance(&mut self) -> bool {
+        let (left, right) = (&mut self.left, &mut self.right);
+        if !left.advance() {
+            return false;
+        }
+        let mut candidate = left.doc();
+        loop {
+            match right.skip_next(candidate) {
+                SkipResult::Reached => {
+                    return true;
+                }
+                SkipResult::End => {
+                    return false;
+                }
+                SkipResult::OverStep => {
+                    candidate = right.doc();
+                }
+            }
+            match left.skip_next(candidate) {
+                SkipResult::Reached => {
+                    return true;
+                }
+                SkipResult::End => {
+                    return false;
+                }
+                SkipResult::OverStep => {
+                    candidate = left.doc();
+                }
+            }
+        }
+    }
+    fn doc(&self) -> DocId {
+        self.left.doc()
+    }
+    fn size_hint(&self) -> u32 {
+        self.left.size_hint().min(self.right.size_hint())
+    }
+}
+impl<TScorer: Scorer> Scorer for IntersectionTwoTerms<TScorer> {
+    fn score(&mut self) -> Score {
+        self.left.score() + self.right.score()
+    }
+}
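For orientation, the `advance` loop in the new file above is the classic leapfrog intersection: take a candidate doc id from one side, ask the other side to `skip_next` to it, and whenever a side oversteps, its current doc id becomes the new candidate. A minimal standalone sketch of the same idea over two sorted slices (illustrative only; `intersect_sorted` is a hypothetical helper, not part of this diff):

// Illustrative sketch, not part of the diff: the same leapfrog idea as
// `IntersectionTwoTerms::advance`, written over two sorted doc-id slices.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    let (mut i, mut j) = (0, 0);
    let mut hits = Vec::new();
    while i < left.len() && j < right.len() {
        if left[i] == right[j] {
            // Both cursors reached the same candidate (the `Reached` case): emit it.
            hits.push(left[i]);
            i += 1;
            j += 1;
        } else if left[i] < right[j] {
            // `right` overstepped the candidate, so `left` has to catch up.
            i += 1;
        } else {
            // `left` overstepped, so `right` has to catch up.
            j += 1;
        }
    }
    hits
}

fn main() {
    assert_eq!(intersect_sorted(&[1, 3, 5, 9], &[3, 4, 5, 10]), vec![3, 5]);
}

The `DocSet` version differs mainly in that `skip_next` lets each side jump directly toward the candidate instead of stepping one document at a time.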


@@ -180,7 +180,7 @@ impl MoreLikeThis {
             // extract the raw value, possibly tokenizing & filtering to update the term frequency map
             match field_entry.field_type() {
-                FieldType::Facet(_) => {
+                FieldType::HierarchicalFacet(_) => {
                     let facets: Vec<&str> = field_values
                         .iter()
                         .map(|field_value| match *field_value.value() {


@@ -61,11 +61,19 @@ impl Query for MoreLikeThisQuery {
 }
 /// The builder for more-like-this query
-#[derive(Debug, Clone, Default)]
+#[derive(Debug, Clone)]
 pub struct MoreLikeThisQueryBuilder {
     mlt: MoreLikeThis,
 }
+impl Default for MoreLikeThisQueryBuilder {
+    fn default() -> Self {
+        Self {
+            mlt: MoreLikeThis::default(),
+        }
+    }
+}
 impl MoreLikeThisQueryBuilder {
     /// Sets the minimum document frequency.
     ///
@@ -176,20 +184,20 @@ mod tests {
     use crate::DocAddress;
     use crate::Index;
-    fn create_test_index() -> crate::Result<Index> {
+    fn create_test_index() -> Index {
         let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field("title", TEXT);
         let body = schema_builder.add_text_field("body", TEXT | STORED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests().unwrap();
-        index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"))?;
+        index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"));
-        index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"))?;
+        index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"));
-        index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"))?;
+        index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"));
-        index_writer.add_document(doc!(title => "ddd", body=> "a lady was riding and old bike"))?;
+        index_writer.add_document(doc!(title => "ddd", body=> "a lady was riding and old bike"));
-        index_writer.add_document(doc!(title => "eee", body=> "Yes, my lady."))?;
+        index_writer.add_document(doc!(title => "eee", body=> "Yes, my lady."));
-        index_writer.commit()?;
+        index_writer.commit().unwrap();
-        Ok(index)
+        index
     }
     #[test]
@@ -235,9 +243,9 @@ mod tests {
     }
     #[test]
-    fn test_more_like_this_query() -> crate::Result<()> {
+    fn test_more_like_this_query() {
-        let index = create_test_index()?;
+        let index = create_test_index();
-        let reader = index.reader()?;
+        let reader = index.reader().unwrap();
         let searcher = reader.searcher();
         // search base 1st doc with words [sea, and] skipping [old]
@@ -250,7 +258,7 @@ mod tests {
             .with_boost_factor(1.0)
             .with_stop_words(vec!["old".to_string()])
             .with_document(DocAddress::new(0, 0));
-        let top_docs = searcher.search(&query, &TopDocs::with_limit(5))?;
+        let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
         let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
         doc_ids.sort_unstable();
@@ -266,12 +274,11 @@ mod tests {
             .with_max_word_length(5)
             .with_boost_factor(1.0)
             .with_document(DocAddress::new(0, 4));
-        let top_docs = searcher.search(&query, &TopDocs::with_limit(5))?;
+        let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
         let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
         doc_ids.sort_unstable();
         assert_eq!(doc_ids.len(), 2);
         assert_eq!(doc_ids, vec![3, 4]);
-        Ok(())
     }
 }


@@ -18,34 +18,34 @@ pub mod tests {
     use crate::DocId;
     use crate::{DocAddress, TERMINATED};
-    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
+    pub fn create_index(texts: &[&'static str]) -> Index {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
             for &text in texts {
                 let doc = doc!(text_field=>text);
-                index_writer.add_document(doc)?;
+                index_writer.add_document(doc);
             }
-            index_writer.commit()?;
+            assert!(index_writer.commit().is_ok());
         }
-        Ok(index)
+        index
     }
     #[test]
-    pub fn test_phrase_query() -> crate::Result<()> {
+    pub fn test_phrase_query() {
         let index = create_index(&[
             "b b b d c g c",
             "a b b d c g c",
             "a b a b c",
             "c a b a d ga a",
             "a b c",
-        ])?;
+        ]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -54,7 +54,7 @@ pub mod tests {
             let phrase_query = PhraseQuery::new(terms);
             let test_fruits = searcher
                 .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-                .unwrap();
+                .expect("search should succeed");
             test_fruits
                 .docs()
                 .iter()
@@ -66,12 +66,11 @@ pub mod tests {
         assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
         assert!(test_query(vec!["g", "ewrwer"]).is_empty());
         assert!(test_query(vec!["g", "a"]).is_empty());
-        Ok(())
     }
     #[test]
     pub fn test_phrase_query_simple() -> crate::Result<()> {
-        let index = create_index(&["a b b d c g c", "a b a b c"])?;
+        let index = create_index(&["a b b d c g c", "a b a b c"]);
         let text_field = index.schema().get_field("text").unwrap();
         let searcher = index.reader()?.searcher();
         let terms: Vec<Term> = vec!["a", "b", "c"]
@@ -87,17 +86,17 @@ pub mod tests {
     }
     #[test]
-    pub fn test_phrase_query_no_score() -> crate::Result<()> {
+    pub fn test_phrase_query_no_score() {
         let index = create_index(&[
             "b b b d c g c",
             "a b b d c g c",
             "a b a b c",
             "c a b a d ga a",
             "a b c",
-        ])?;
+        ]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -106,7 +105,7 @@ pub mod tests {
             let phrase_query = PhraseQuery::new(terms);
             let test_fruits = searcher
                 .search(&phrase_query, &TEST_COLLECTOR_WITHOUT_SCORE)
-                .unwrap();
+                .expect("search should succeed");
             test_fruits
                 .docs()
                 .iter()
@@ -118,11 +117,10 @@ pub mod tests {
         assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
         assert!(test_query(vec!["g", "ewrwer"]).is_empty());
         assert!(test_query(vec!["g", "a"]).is_empty());
-        Ok(())
     }
     #[test]
-    pub fn test_phrase_query_no_positions() -> crate::Result<()> {
+    pub fn test_phrase_query_no_positions() {
         let mut schema_builder = Schema::builder();
         use crate::schema::IndexRecordOption;
         use crate::schema::TextFieldIndexing;
@@ -137,34 +135,33 @@ pub mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
-            index_writer.add_document(doc!(text_field=>"a b c"))?;
+            index_writer.add_document(doc!(text_field=>"a b c"));
-            index_writer.commit()?;
+            assert!(index_writer.commit().is_ok());
         }
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let phrase_query = PhraseQuery::new(vec![
             Term::from_field_text(text_field, "a"),
             Term::from_field_text(text_field, "b"),
         ]);
-        let search_error = searcher
+        let search_result = searcher
             .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-            .err();
+            .map(|_| ());
         assert!(matches!(
-            search_error,
+            search_result,
-            Some(crate::TantivyError::SchemaError(msg))
+            Err(crate::TantivyError::SchemaError(msg))
             if msg == "Applied phrase query on field \"text\", which does not have positions \
                       indexed"
         ));
-        Ok(())
     }
     #[test]
-    pub fn test_phrase_score() -> crate::Result<()> {
+    pub fn test_phrase_score() {
-        let index = create_index(&["a b c", "a b c a b"])?;
+        let index = create_index(&["a b c", "a b c a b"]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -180,24 +177,23 @@ pub mod tests {
         let scores = test_query(vec!["a", "b"]);
         assert_nearly_equals!(scores[0], 0.40618482);
         assert_nearly_equals!(scores[1], 0.46844664);
-        Ok(())
     }
     #[test] // motivated by #234
-    pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
+    pub fn test_phrase_query_docfreq_order() {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
-            index_writer.add_document(doc!(text_field=>"b"))?;
+            index_writer.add_document(doc!(text_field=>"b"));
-            index_writer.add_document(doc!(text_field=>"a b"))?;
+            index_writer.add_document(doc!(text_field=>"a b"));
-            index_writer.add_document(doc!(text_field=>"b a"))?;
+            index_writer.add_document(doc!(text_field=>"b a"));
-            index_writer.commit()?;
+            assert!(index_writer.commit().is_ok());
         }
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -212,19 +208,18 @@ pub mod tests {
         };
         assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress::new(0, 1)]);
         assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress::new(0, 2)]);
-        Ok(())
     }
     #[test] // motivated by #234
-    pub fn test_phrase_query_non_trivial_offsets() -> crate::Result<()> {
+    pub fn test_phrase_query_non_trivial_offsets() {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
-            index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
+            index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
-            index_writer.commit()?;
+            assert!(index_writer.commit().is_ok());
         }
         let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<(usize, &str)>| {
@@ -250,6 +245,5 @@ pub mod tests {
         assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
         assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
         assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
-        Ok(())
     }
 }


@@ -116,18 +116,19 @@ mod tests {
     use crate::{DocSet, Term};
     #[test]
-    pub fn test_phrase_count() -> crate::Result<()> {
+    pub fn test_phrase_count() {
-        let index = create_index(&["a c", "a a b d a b c", " a b"])?;
+        let index = create_index(&["a c", "a a b d a b c", " a b"]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let phrase_query = PhraseQuery::new(vec![
             Term::from_field_text(text_field, "a"),
             Term::from_field_text(text_field, "b"),
         ]);
         let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
         let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)
+            .unwrap()
             .unwrap();
         assert_eq!(phrase_scorer.doc(), 1);
         assert_eq!(phrase_scorer.phrase_count(), 2);
@@ -135,6 +136,5 @@ mod tests {
         assert_eq!(phrase_scorer.doc(), 2);
         assert_eq!(phrase_scorer.phrase_count(), 1);
         assert_eq!(phrase_scorer.advance(), TERMINATED);
-        Ok(())
     }
 }

Some files were not shown because too many files have changed in this diff.