Compare commits


72 Commits

Author SHA1 Message Date
Paul Masurel
89f91b1b58 first stab 2021-10-06 12:10:16 +09:00
Paul Masurel
19965c46bc Added wasm-mt 2021-10-06 10:45:17 +09:00
dependabot[bot]
4d05b26e7a Update lru requirement from 0.6.5 to 0.7.0 (#1165)
Updates the requirements on [lru](https://github.com/jeromefroe/lru-rs) to permit the latest version.
- [Release notes](https://github.com/jeromefroe/lru-rs/releases)
- [Changelog](https://github.com/jeromefroe/lru-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jeromefroe/lru-rs/compare/0.6.5...0.7.0)

---
updated-dependencies:
- dependency-name: lru
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-10-06 05:50:24 +09:00
Paul Masurel
0855649986 Leaning more on the alive (vs delete) semantics. (#1164) 2021-10-05 18:53:29 +09:00
PSeitz
d828e58903 Merge pull request #1163 from PSeitz/reduce_mem_usage
reduce mem usage
2021-10-01 08:03:41 +02:00
Pascal Seitz
aa0396fe27 fix variable names 2021-10-01 13:48:51 +08:00
Pascal Seitz
8d8315f8d0 prealloc vec in postinglist 2021-09-29 09:02:38 +08:00
Pascal Seitz
078c0a2e2e reserve vec 2021-09-29 08:45:04 +08:00
Pascal Seitz
f21e8dd875 use only segment ordinal in docidmapping 2021-09-29 08:44:56 +08:00
Tomoko Uchida
74e36c7e97 Add unit tests for tokenizers and filters (#1156)
* add unit test for SimpleTokenizer
* add unit tests for tokenizers and filters.
2021-09-27 10:22:01 +09:00
PSeitz
f27ae04282 fix slope calculation in multilinear interpol (#1161)
add test to check for compression
2021-09-27 10:14:03 +09:00
PSeitz
0ce49c9dd4 use lz4_flex 0.9.0 (#1160) 2021-09-27 10:12:20 +09:00
PSeitz
fe8e58e078 Merge pull request #1154 from PSeitz/delete_bitset
add DeleteBitSet iterator
2021-09-24 09:37:39 +02:00
Pascal Seitz
efc0d8341b fix comment 2021-09-24 15:09:21 +08:00
Pascal Seitz
22bcc83d10 fix padding in initialization 2021-09-24 14:43:04 +08:00
Pascal Seitz
5ee5037934 create and use ReadSerializedBitSet 2021-09-24 12:53:33 +08:00
Pascal Seitz
c217bfed1e cargo fmt 2021-09-23 21:02:19 +08:00
Pascal Seitz
c27ccd3e24 improve naming 2021-09-23 21:02:09 +08:00
Paul Masurel
367f5da782 Fixed comment to the index accessor 2021-09-23 21:53:48 +09:00
Mestery
b256df6599 add index accessor for index writer (#1159)
* add index accessor for index writer

* Update src/indexer/index_writer.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>
2021-09-23 21:49:20 +09:00
Pascal Seitz
d7a6a409a1 renames 2021-09-23 20:33:11 +08:00
Pascal Seitz
a1f5cead96 AliveBitSet instead of DeleteBitSet 2021-09-23 20:03:57 +08:00
dependabot[bot]
37c5fe3c86 Update memmap2 requirement from 0.4 to 0.5 (#1157)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.4.0...v0.5.0)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-09-23 20:18:27 +09:00
Pascal Seitz
4583fa270b fixes 2021-09-23 10:39:53 +08:00
Pascal Seitz
beb3a5bd73 fix len 2021-09-18 17:58:15 +08:00
Pascal Seitz
93cbd52bf0 move code to bitset, add inline, add benchmark 2021-09-18 17:35:22 +08:00
Pascal Seitz
c22177a005 add iterator 2021-09-17 15:29:27 +08:00
Pascal Seitz
4da71273e1 add de/serialization for bitset
remove len footgun
2021-09-17 10:28:12 +08:00
dependabot[bot]
2c78b31aab Update memmap2 requirement from 0.3 to 0.4 (#1155)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v.0.3.0...v0.4.0)
2021-09-17 08:52:52 +09:00
Pascal Seitz
4ae1d87632 add DeleteBitSet iterator 2021-09-15 23:10:04 +08:00
Paul Masurel
46b86a7976 Bumped version and edited changelog 2021-09-10 23:05:09 +09:00
PSeitz
3bc177e69d fix #1151 (#1152)
* fix #1151

Fixes an off-by-one error in the stats for the index fast field in the multi-value fast field.
When retrieving the data range for a docid, `get(docid)..get(docid + 1)` is requested. On creation,
the num_vals statistic was set to docid instead of docid + 1. In the multivaluelinearinterpol fast
field the last value was therefore not serialized (and would return 0 instead in most cases).
So the last document's `get(lastdoc)..get(lastdoc + 1)` would return the invalid range `value..0`.

This PR adds a proptest to cover this scenario. It requires a combination of a large number of values
(multilinear interpolation is only active for more than 5_000 values) and a merge.
2021-09-10 23:00:37 +09:00
PSeitz
319609e9c1 test cargo-llvm-cov (#1149) 2021-09-03 22:00:43 +09:00
Kanji Yomoda
9d87b89718 Fix incorrect comment for Index::create_in_dir (#1148)
* Fix incorrect comment for Index::create_in_dir
2021-09-03 10:37:16 +09:00
Tomoko Uchida
dd81e38e53 Add WhitespaceTokenizer (#1147)
* Add WhitespaceTokenizer.
2021-08-29 18:20:49 +09:00
Paul Masurel
9f32b22602 Preparing for release. 2021-08-26 09:07:08 +09:00
sigaloid
096ce7488e Resolve some clippys, format (#1144)
* cargo +nightly clippy --fix -Z unstable-options
2021-08-26 08:46:00 +09:00
PSeitz
a1782dd172 Update index_sorting.md 2021-08-25 07:55:50 +01:00
PSeitz
000d76b11a Update index_sorting.md 2021-08-24 19:28:06 +01:00
PSeitz
abd29f6646 Update index_sorting.md 2021-08-24 19:26:19 +01:00
PSeitz
b4ecf0ab2f Merge pull request #1146 from tantivy-search/sorting_doc
add sorting to book
2021-08-23 17:37:54 +01:00
Pascal Seitz
798f7dbf67 add sorting to book 2021-08-23 17:36:41 +01:00
PSeitz
06a2e47c8d Merge pull request #1145 from tantivy-search/blub2
cargo fmt
2021-08-21 18:52:50 +01:00
Pascal Seitz
e0b83eb291 cargo fmt 2021-08-21 18:52:10 +01:00
PSeitz
13401f46ea add wildcard mention 2021-08-21 18:10:33 +01:00
PSeitz
1a45b030dc Merge pull request #1141 from tantivy-search/tantivy_common
dissolve common module
2021-08-20 08:03:37 +01:00
Pascal Seitz
62052bcc2d add missing test function
closes #1139
2021-08-20 07:26:22 +01:00
Pascal Seitz
3265f7bec3 dissolve common module 2021-08-19 23:26:34 +01:00
Pascal Seitz
ee0881712a move bitset to common crate, move composite file to directory 2021-08-19 17:45:09 +01:00
PSeitz
483e0336b6 Merge pull request #1140 from tantivy-search/tantivy_common
rename common to tantivy-common
2021-08-19 13:02:54 +01:00
Pascal Seitz
3e8f267e33 rename common to tantivy-common 2021-08-19 10:27:20 +01:00
Paul Masurel
3b247fd968 Version bump 2021-08-19 10:12:30 +09:00
Paul Masurel
750f6e6479 Removed obsolete unit test (#1138) 2021-08-19 10:07:49 +09:00
Evance Soumaoro
5b475e6603 Checksum validation using active files (#1130)
* validate_checksum now uses segment files, not managed files
2021-08-19 10:03:20 +09:00
PSeitz
0ca7f73dc5 add docs badge, fix build badge 2021-08-13 19:40:33 +01:00
PSeitz
47ed18845e Merge pull request #1136 from tantivy-search/minor_fixes
more docs detail
2021-08-13 18:11:47 +01:00
Pascal Seitz
dc141cdb29 more docs detail
remove code duplicate
2021-08-13 17:40:13 +01:00
PSeitz
f6cf6e889b Merge pull request #1133 from tantivy-search/merge_overflow
test doc_freq and term_freq in sorted index
2021-08-05 07:53:46 +01:00
Pascal Seitz
f379a80233 test doc_freq and term_freq in sorted index 2021-08-03 11:38:05 +01:00
PSeitz
4a320fd1ff fix delta position in merge and index sorting (#1132)
fixes #1125
2021-08-03 18:06:36 +09:00
PSeitz
85d23e8e3b Merge pull request #1129 from tantivy-search/merge_overflow
add long running test in ci
2021-08-02 15:54:31 +01:00
Pascal Seitz
022ab9d298 don't run as pr 2021-08-02 15:44:00 +01:00
Pascal Seitz
605e8603dc add positions to long running test 2021-08-02 15:29:49 +01:00
Pascal Seitz
70f160b329 add long running test in ci 2021-08-02 11:35:39 +01:00
PSeitz
6d265e6bed fix gh action name 2021-08-02 10:38:01 +01:00
PSeitz
fdc512391b Merge pull request #1128 from tantivy-search/merge_overflow
add sort to functional test, add env for iterations
2021-08-02 10:29:16 +01:00
Pascal Seitz
108714c934 add sort to functional test, add env for iterations 2021-08-02 10:11:17 +01:00
Paul Masurel
44e8cf98a5 Cargo fmt 2021-07-30 15:30:01 +09:00
Paul Masurel
f0ee69d9e9 Remove the complicated block search logic for a simpler branchless (#1124)
binary search

The code is simpler and faster.

Before
test postings::bench::bench_segment_intersection                                                                         ... bench:   2,093,697 ns/iter (+/- 115,509)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      58,585 ns/iter (+/- 796)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     160,872 ns/iter (+/- 5,164)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     615,229 ns/iter (+/- 25,108)
test postings::bench::bench_skip_next_p90                                                                                ... bench:   1,120,509 ns/iter (+/- 22,271)

After
test postings::bench::bench_segment_intersection                                                                         ... bench:   1,747,726 ns/iter (+/- 52,867)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      55,205 ns/iter (+/- 714)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     131,433 ns/iter (+/- 2,814)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     478,830 ns/iter (+/- 12,794)
test postings::bench::bench_skip_next_p90                                                                                ... bench:     931,082 ns/iter (+/- 31,468)
2021-07-30 14:38:42 +09:00
Evance Soumaoro
b8a10c8406 switched to memmap2-rs (#1120) 2021-07-27 18:40:41 +09:00
PSeitz
ff4813529e add comments on compression (#1119) 2021-07-26 22:54:22 +09:00
PSeitz
470bc18e9b Merge pull request #1118 from tantivy-search/remove_rand
move rand to optional dependencies
2021-07-21 18:01:22 +01:00
105 changed files with 2066 additions and 1196 deletions

View File

@@ -1,27 +1,25 @@
name: coverage
name: Coverage
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
test:
name: coverage
runs-on: ubuntu-latest
container:
image: xd009642/tarpaulin:develop-nightly
options: --security-opt seccomp=unconfined
coverage:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Generate code coverage
run: |
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
- name: Upload to codecov.io
uses: codecov/codecov-action@v1
- uses: actions/checkout@v2
- name: Install Rust
run: rustup toolchain install nightly --component llvm-tools-preview
- name: Install cargo-llvm-cov
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
- name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info
fail_ci_if_error: true

.github/workflows/long_running.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
name: Rust
on:
push:
branches: [ main ]
env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
jobs:
functional_test_unsorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
functional_test_sorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

View File

@@ -10,7 +10,7 @@ env:
CARGO_TERM_COLOR: always
jobs:
build:
test:
runs-on: ubuntu-latest

View File

@@ -1,3 +1,12 @@
Tantivy 0.16.1
========================
- Major Bugfix on multivalued fastfield. #1151
Tantivy 0.16.0
=========================
- Bugfix in the filesum check. (@evanxg852000) #1127
- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
Tantivy 0.15.3
=========================
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.16.0-dev"
version = "0.16.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -19,23 +19,24 @@ crc32fast = "1.2.1"
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.9.0", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.2", optional = true }
log = "0.4.14"
serde = { version = "1.0.126", features = ["derive"] }
serde_closure = "0.3"
serde_json = "1.0.64"
num_cpus = "1.13"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2"
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8"
crossbeam = "0.8.1"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
common = { version = "0.1", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.1", path="./ownedbytes" }
stable_deref_trait = "1.2"
@@ -50,11 +51,12 @@ fail = "0.4"
murmurhash32 = "0.2"
chrono = "0.4.19"
smallvec = "1.6.1"
rayon = "1.5"
lru = "0.6.5"
lru = "0.7.0"
fastdivide = "0.3"
itertools = "0.10.0"
measure_time = "0.7.0"
wasm-mt = "0.1"
wasm-mt-pool = "0.1"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -64,7 +66,9 @@ rand = "0.8.3"
maplit = "1.0.2"
matches = "0.1.8"
proptest = "1.0"
criterion = "0.3.4"
criterion = "0.3.5"
test-env-log = "0.2.7"
env_logger = "0.9.0"
[dev-dependencies.fail]
version = "0.4"
@@ -81,7 +85,7 @@ overflow-checks = true
[features]
default = ["mmap", "lz4-compression" ]
mmap = ["fs2", "tempfile", "memmap"]
mmap = ["fs2", "tempfile", "memmap2"]
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]

View File

@@ -1,9 +1,9 @@
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=main)](https://travis-ci.org/tantivy-search/tantivy)
[![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/)
[![Build Status](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml)
[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/main?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.0"
version = "0.1.1"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -50,3 +50,32 @@ where
}
None
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}

View File

@@ -1,5 +1,5 @@
[package]
name = "common"
name = "tantivy-common"
version = "0.1.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
@@ -10,3 +10,8 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.1", path="../ownedbytes" }
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"

View File

@@ -1,8 +1,11 @@
use std::fmt;
use ownedbytes::OwnedBytes;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
use std::{fmt, io};
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);
pub struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -14,6 +17,7 @@ pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
@@ -28,30 +32,54 @@ impl IntoIterator for TinySet {
}
impl TinySet {
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.0.to_le_bytes().as_ref())
}
#[inline]
pub fn deserialize(data: [u8; 8]) -> io::Result<Self> {
let val: u64 = u64::from_le_bytes(data);
Ok(TinySet(val))
}
/// Returns an empty `TinySet`.
#[inline]
pub fn empty() -> TinySet {
TinySet(0u64)
}
/// Returns a full `TinySet`.
#[inline]
pub fn full() -> TinySet {
TinySet::empty().complement()
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
#[inline]
/// Returns the complement of the set in `[0, 64)`.
///
/// Careful when making this function public, as it will break the padding handling in the last
/// bucket.
fn complement(self) -> TinySet {
TinySet(!self.0)
}
#[inline]
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
#[inline]
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
#[inline]
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
@@ -64,13 +92,21 @@ impl TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64[
/// Insert a new element within [0..64)
#[inline]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Insert a new element within [0..64[
/// Removes an element within [0..64)
#[inline]
pub fn remove(self, el: u32) -> TinySet {
self.intersect(TinySet::singleton(el).complement())
}
/// Insert a new element within [0..64)
///
/// returns true if the set changed
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
@@ -78,6 +114,16 @@ impl TinySet {
old != *self
}
/// Removes an element within [0..64)
///
/// returns true if the set changed
#[inline]
pub fn remove_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.remove(el);
old != *self
}
/// Returns the union of two tinysets
#[inline]
pub fn union(self, other: TinySet) -> TinySet {
@@ -123,7 +169,7 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize,
len: u64,
max_value: u32,
}
@@ -132,8 +178,41 @@ fn num_buckets(max_val: u32) -> u32 {
}
impl BitSet {
/// Serialize a `BitSet`.
///
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter() {
tinyset.serialize(writer)?;
}
writer.flush()?;
Ok(())
}
/// Deserialize a `BitSet`.
///
#[cfg(test)]
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
data = &data[4..];
let mut len: u64 = 0;
let mut tinysets = vec![];
for chunk in data.chunks_exact(8) {
let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?;
len += tinyset.len() as u64;
tinysets.push(tinyset);
}
Ok(BitSet {
tinysets: tinysets.into_boxed_slice(),
len,
max_value,
})
}
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
/// within `[0, max_val)`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
@@ -144,6 +223,23 @@ impl BitSet {
}
}
/// Create a new `BitSet` that may contain elements within `[0, max_val)`.
/// Initially all values will be set.
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let mut tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
// Fix padding
let lower = max_value % 64u32;
tinybisets[tinybisets.len() - 1] = TinySet::range_lower(lower);
BitSet {
tinysets: tinybisets,
len: max_value as u64,
max_value,
}
}
/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
@@ -153,10 +249,11 @@ impl BitSet {
/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
self.len as usize
}
/// Inserts an element in the `BitSet`
#[inline]
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
@@ -168,7 +265,21 @@ impl BitSet {
};
}
/// Removes an element from the `BitSet`
#[inline]
pub fn remove(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the element is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
@@ -178,7 +289,7 @@ impl BitSet {
///
/// Reminder: the tiny set with the bucket `bucket` represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
@@ -193,23 +304,149 @@ impl BitSet {
/// Returns the tiny bitset representing
/// the set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
pub fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadSerializedBitSet {
data: OwnedBytes,
max_value: u32,
}
impl ReadSerializedBitSet {
pub fn open(data: OwnedBytes) -> Self {
let (max_value_data, data) = data.split(4);
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
ReadSerializedBitSet { data, max_value }
}
/// Number of elements in the bitset.
#[inline]
pub fn len(&self) -> usize {
self.iter_tinysets()
.map(|tinyset| tinyset.len() as usize)
.sum()
}
/// Iterates the tinysets on the fly from the serialized data.
///
#[inline]
fn iter_tinysets<'a>(&'a self) -> impl Iterator<Item = TinySet> + 'a {
assert!((self.data.len()) % 8 == 0);
self.data.chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
tinyset
})
}
/// Iterate over the positions of the elements.
///
#[inline]
pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
self.iter_tinysets()
.enumerate()
.flat_map(move |(chunk_num, tinyset)| {
let chunk_base_val = chunk_num as u32 * 64;
tinyset
.into_iter()
.map(move |val| val + chunk_base_val)
.take_while(move |doc| *doc < self.max_value)
})
}
/// Returns true iff the element is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool {
let byte_offset = el / 8u32;
let b: u8 = self.data[byte_offset as usize];
let shift = (el % 8) as u8;
b & (1u8 << shift) != 0
}
/// Maximum value the bitset may contain.
/// (Note this is not the maximum value contained in the set.)
///
/// A bitset has an intrinsic capacity.
/// It only stores elements within [0..max_value).
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
}
}
#[cfg(test)]
mod tests {
use super::BitSet;
use super::ReadSerializedBitSet;
use super::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use ownedbytes::OwnedBytes;
use rand::distributions::Bernoulli;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::collections::HashSet;
use std::convert::TryInto;
#[test]
fn test_read_serialized_bitset_full() {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 4);
}
#[test]
fn test_read_serialized_bitset_empty() {
let mut bitset = BitSet::with_max_value(5);
bitset.insert(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 1);
{
let bitset = BitSet::with_max_value(5);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 0);
}
}
#[test]
fn test_tiny_set_remove() {
{
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty()
.insert(63u32)
.insert(1)
.insert(5)
.remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(1u32);
assert!(u.pop_lowest().is_none());
}
}
#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
@@ -235,6 +472,21 @@ mod tests {
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32).insert(5);
assert_eq!(u.pop_lowest(), Some(5u32));
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
{
let u = TinySet::empty().insert(63u32).insert(5);
let mut data = vec![];
u.serialize(&mut data).unwrap();
let mut u = TinySet::deserialize(data[..8].try_into().unwrap()).unwrap();
assert_eq!(u.pop_lowest(), Some(5u32));
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
}
#[test]
@@ -251,6 +503,16 @@ mod tests {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
// test deser
let mut data = vec![];
bitset.serialize(&mut data).unwrap();
let bitset = BitSet::deserialize(&data).unwrap();
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
assert_eq!(bitset.len(), els.len());
};
test_against_hashset(&[], 0);
@@ -263,29 +525,6 @@ mod tests {
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
@@ -338,12 +577,33 @@ mod tests {
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
bitset.remove(105u32);
assert_eq!(bitset.len(), 3);
bitset.remove(104u32);
assert_eq!(bitset.len(), 2);
bitset.remove(3u32);
assert_eq!(bitset.len(), 1);
bitset.remove(103u32);
assert_eq!(bitset.len(), 0);
}
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
StdRng::from_seed([seed_val; 32])
.sample_iter(&Bernoulli::new(ratio).unwrap())
.take(n as usize)
.enumerate()
.filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
.collect()
}
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f64);
let els = sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}

View File

@@ -1,9 +1,167 @@
use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
#[cfg(test)]
pub mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use super::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
}

View File

@@ -7,6 +7,7 @@
- [Segments](./basis.md)
- [Defining your schema](./schema.md)
- [Facetting](./facetting.md)
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Best practice](./inverted_index.md)

doc/src/index_sorting.md Normal file
View File

@@ -0,0 +1,61 @@
- [Index Sorting](#index-sorting)
+ [Why Sorting](#why-sorting)
* [Compression](#compression)
* [Top-N Optimization](#top-n-optimization)
* [Pruning](#pruning)
* [Other](#other)
+ [Usage](#usage)
# Index Sorting
Tantivy allows you to sort the index by a field.
## Why Sorting
Presorting an index has several advantages:
###### Compression
When data is sorted, it is easier to compress. E.g. the number sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5].
With delta encoding, the unsorted list becomes [5, -3, 1, -2, 3], while the sorted list becomes [1, 1, 1, 1, 1], which packs into far fewer bits.
The compression ratio mainly improves on the fast field of the sorted property; everything else is likely unaffected.
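A minimal sketch of the effect (plain Rust, not tantivy's actual codec; `delta_encode` is a hypothetical helper):
```rust
/// Delta-encode a list: each output value is the difference to the
/// previous input value (the first value is kept as-is).
fn delta_encode(vals: &[i64]) -> Vec<i64> {
    let mut prev = 0i64;
    vals.iter()
        .map(|&v| {
            let delta = v - prev;
            prev = v;
            delta
        })
        .collect()
}

fn main() {
    // Unsorted input produces large, signed deltas...
    assert_eq!(delta_encode(&[5, 2, 3, 1, 4]), vec![5, -3, 1, -2, 3]);
    // ...while sorted input produces small, uniform deltas that
    // bitpack into far fewer bits per element.
    assert_eq!(delta_encode(&[1, 2, 3, 4, 5]), vec![1, 1, 1, 1, 1]);
}
```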
###### Top-N Optimization
When the data is presorted by a field and a search query requests sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and we want the top-n newest docs containing a term, we can simply leverage the order of the docids, as sketched below.
Note: Tantivy 0.16 does not do this optimization yet.
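A sketch of the idea (hypothetical, since the optimization does not exist yet):
```rust
/// Hypothetical sketch: with an index sorted by timestamp descending,
/// docid order is newest-first order, so the top-n newest matches are
/// simply the first n docids produced by the query.
fn top_n_newest(matching_docids: impl Iterator<Item = u32>, n: usize) -> Vec<u32> {
    // No heap and no full sort needed: stop after n hits.
    matching_docids.take(n).collect()
}
```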
###### Pruning
Let's say we want all documents matching the filter `>= 2010-08-11`. When the data is sorted by that field, we can do a lookup in the fast field to find the matching docid range and use this range as the filter, as sketched below.
Note: Tantivy 0.16 does not do this optimization yet.
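A hedged sketch of that lookup (assuming an ascending sort on the filtered field; `fast_field` stands in for a docid-to-value accessor):
```rust
use std::ops::Range;

/// Hypothetical sketch: with the index sorted ascending by the field,
/// all docids satisfying `value >= threshold` form one contiguous
/// suffix, found with a single binary search over the fast field.
fn docid_range_for_filter(fast_field: &[u64], threshold: u64) -> Range<u32> {
    let start = fast_field.partition_point(|&v| v < threshold) as u32;
    start..fast_field.len() as u32
}
```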
###### Other?
In principle, many other algorithms could exploit the monotonically increasing nature of the data (aggregations, maybe?).
## Usage
Index sorting can be configured by setting [`sort_by_field`](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields can be used as the sort field.
```rust
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(settings);
let index = index_builder.create_in_ram().unwrap();
```
## Implementation details
Index sorting is applied in the serialization step. In general there are two serialization steps: [finishing a single segment](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
In both cases we generate a docid mapping that reflects the sort. This mapping is used when serializing the different components (doc store, fast fields, posting lists, fieldnorms, facets).
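A simplified sketch of how such a mapping could be applied (an assumed shape, not tantivy's exact serializer code; by convention here `doc_id_map[new_docid] == old_docid`):
```rust
/// Hypothetical sketch: serializing a fast field column in sorted
/// order becomes a gather over the old, unsorted values.
fn remap_fast_field(old_values: &[u64], doc_id_map: &[u32]) -> Vec<u64> {
    doc_id_map
        .iter()
        .map(|&old_docid| old_values[old_docid as usize])
        .collect()
}
```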

View File

@@ -86,12 +86,10 @@ impl Collector for StatsCollector {
fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
let mut stats = Stats::default();
for segment_stats_opt in segment_stats {
if let Some(segment_stats) = segment_stats_opt {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
for segment_stats in segment_stats.into_iter().flatten() {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
Ok(stats.non_zero_count())
}

View File

@@ -9,8 +9,8 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
common = { version = "0.1", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}

View File

@@ -118,7 +118,7 @@ mod tests {
);
}
}
let actual_compression = data.len() as f32 / out.len() as f32;
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

View File

@@ -239,11 +239,21 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(data, name);
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.01);
assert!(estimate < 0.01);
}
#[test]

View File

@@ -1,3 +1,17 @@
/*!
The MultiLinearInterpol compressor uses linear interpolation to guess a value and stores the offset from that guess, in blocks of 512.
With a CHUNK_SIZE of 512 and 29 bytes of metadata per block, the metadata overhead is 29 * 8 = 232 bits, i.e. 232 / 512 ≈ 0.45 bits per element.
The additional space required per element in a block is the number of bits needed to encode the maximum deviation from the linear interpolation estimation function.
E.g. if the maximum deviation within a block is 12, every element in it costs 4 bits.
Size per block:
Num Elements * bits(Maximum Deviation from Interpolation) + 29 bytes metadata
*/
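// Worked example of the formula above (hypothetical numbers): a block of
// 512 values whose maximum deviation from the interpolation line is 12
// needs ceil(log2(12 + 1)) = 4 bits per value, so the block costs
// 512 * 4 bits + 29 bytes of metadata = 256 + 29 = 285 bytes.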
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
@@ -43,7 +57,7 @@ struct Function {
impl Function {
fn calc_slope(&mut self) {
let num_vals = self.end_pos - self.start_pos;
get_slope(self.value_start_pos, self.value_end_pos, num_vals);
self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
}
// split the interpolation into two functions, change self and return the second split
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
@@ -364,11 +378,22 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(data, name);
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.01);
}
#[test]
@@ -400,9 +425,11 @@ mod tests {
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u64>() as u64)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
create_and_validate(&data, "random");
let (estimate, actual_compression) = create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse();
create_and_validate(&data, "random");

View File

@@ -4,6 +4,7 @@ name = "ownedbytes"
version = "0.1.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@@ -178,9 +178,9 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
weight.for_each(reader, &mut |doc, score| {
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;

View File

@@ -629,10 +629,10 @@ impl Collector for TopDocs {
let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if delete_bitset.is_deleted(doc) {
if alive_bitset.is_deleted(doc) {
return threshold;
}
let heap_item = ComparableDoc {

View File

@@ -1,203 +0,0 @@
mod bitset;
mod composite_file;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use byteorder::LittleEndian as Endianness;
pub use common::CountingWriter;
pub use common::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
#[cfg(test)]
pub(crate) mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use common::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
pub use tantivy_bitpacker::minmax;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}
}

View File

@@ -1,5 +1,4 @@
use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder};
/// Search executor that runs search requests either single-threaded or multi-threaded.
///
@@ -11,8 +10,6 @@ use rayon::{ThreadPool, ThreadPoolBuilder};
pub enum Executor {
/// Single thread variant of an Executor
SingleThread,
/// Thread pool variant of an Executor
ThreadPool(ThreadPool),
}
impl Executor {
@@ -21,15 +18,6 @@ impl Executor {
Executor::SingleThread
}
/// Creates an Executor that dispatches the tasks in a thread pool.
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
let pool = ThreadPoolBuilder::new()
.num_threads(num_threads)
.thread_name(move |num| format!("{}{}", prefix, num))
.build()?;
Ok(Executor::ThreadPool(pool))
}
/// Perform a map in the thread pool.
///
/// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
@@ -46,40 +34,6 @@ impl Executor {
) -> crate::Result<Vec<R>> {
match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
Executor::ThreadPool(pool) => {
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
let num_fruits = args_with_indices.len();
let fruit_receiver = {
let (fruit_sender, fruit_receiver) = channel::unbounded();
pool.scope(|scope| {
for arg_with_idx in args_with_indices {
scope.spawn(|_| {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
}
});
}
});
fruit_receiver
// This ends the scope of fruit_sender.
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}
}

View File

@@ -120,7 +120,7 @@ impl IndexBuilder {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
@@ -229,7 +229,8 @@ impl Index {
/// Creates a new index using the `RamDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
/// This is useful for indexing a small set of documents,
/// for instance in unit tests or for a temporary in-memory index.
pub fn create_in_ram(schema: Schema) -> Index {
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
}
@@ -237,7 +238,7 @@ impl Index {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, then it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(
directory_path: P,
@@ -523,7 +524,22 @@ impl Index {
/// Returns the set of corrupted files
pub fn validate_checksum(&self) -> crate::Result<HashSet<PathBuf>> {
self.directory.list_damaged().map_err(Into::into)
let managed_files = self.directory.list_managed_files();
let active_segments_files: HashSet<PathBuf> = self
.searchable_segment_metas()?
.iter()
.flat_map(|segment_meta| segment_meta.list_files())
.collect();
let active_existing_files: HashSet<&PathBuf> =
active_segments_files.intersection(&managed_files).collect();
let mut damaged_files = HashSet::new();
for path in active_existing_files {
if !self.directory.validate_checksum(path)? {
damaged_files.insert((*path).clone());
}
}
Ok(damaged_files)
}
}

View File

@@ -101,6 +101,7 @@ impl SegmentMeta {
/// Returns the list of files that
/// are required for the segment meta.
/// Note: Some of the returned files may not exist depending on the state of the segment.
///
/// This is useful as the way tantivy removes files
/// is by removing all files that have been created by tantivy

View File

@@ -1,6 +1,5 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
@@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use common::BinarySerializable;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.

View File

@@ -2,8 +2,10 @@ use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -14,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use fail::fail_point;
use std::fmt;
use std::sync::Arc;
@@ -46,7 +47,7 @@ pub struct SegmentReader {
fieldnorm_readers: FieldNormReaders,
store_file: FileSlice,
delete_bitset_opt: Option<DeleteBitSet>,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
}
@@ -71,14 +72,12 @@ impl SegmentReader {
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
self.max_doc - self.num_docs
}
/// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.delete_bitset().is_some()
self.num_deleted_docs() > 0
}
/// Accessor to a segment's fast field reader given a field.
@@ -169,10 +168,10 @@ impl SegmentReader {
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::Delete)?;
let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(delete_bitset)
let alive_bitset_opt = if segment.meta().has_deletes() {
let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?;
let alive_bitset = AliveBitSet::open(alive_bitset_bytes);
Some(alive_bitset)
} else {
None
};
@@ -187,7 +186,7 @@ impl SegmentReader {
fieldnorm_readers,
segment_id: segment.id(),
store_file,
delete_bitset_opt,
alive_bitset_opt,
positions_composite,
schema,
})
@@ -273,21 +272,25 @@ impl SegmentReader {
/// Returns the bitset representing
/// the documents that have been deleted.
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
self.delete_bitset_opt.as_ref()
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
self.alive_bitset_opt.as_ref()
}
/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
self.delete_bitset()
self.alive_bitset()
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
if let Some(alive_bitset) = &self.alive_bitset_opt {
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc)
}
}
/// Summarize total space usage of this segment.
@@ -300,9 +303,9 @@ impl SegmentReader {
self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader()?.space_usage(),
self.delete_bitset_opt
self.alive_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.map(AliveBitSet::space_usage)
.unwrap_or(0),
))
}

View File

@@ -1,18 +1,17 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::FileSlice;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use common::BinarySerializable;
use common::CountingWriter;
use common::HasLen;
use common::VInt;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;
use super::HasLen;
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
@@ -188,10 +187,10 @@ impl CompositeFile {
mod test {
use super::{CompositeFile, CompositeWrite};
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::directory::{Directory, RamDirectory};
use crate::schema::Field;
use common::BinarySerializable;
use common::VInt;
use std::io::Write;
use std::path::Path;

View File

@@ -1,7 +1,7 @@
use stable_deref_trait::StableDeref;
use crate::common::HasLen;
use crate::directory::OwnedBytes;
use common::HasLen;
use std::fmt;
use std::ops::Range;
use std::sync::{Arc, Weak};
@@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] {
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
impl<B> From<B> for FileSlice
where
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
@@ -178,7 +172,7 @@ impl HasLen for FileSlice {
#[cfg(test)]
mod tests {
use super::{FileHandle, FileSlice};
use crate::common::HasLen;
use common::HasLen;
use std::io;
#[test]

View File

@@ -1,10 +1,10 @@
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::{
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
directory::{AntiCallToken, TerminatingWrite},
Version, INDEX_FORMAT_VERSION,
};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
@@ -156,10 +156,8 @@ mod tests {
use crate::directory::footer::Footer;
use crate::directory::OwnedBytes;
use crate::{
common::BinarySerializable,
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
};
use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
use common::BinarySerializable;
use std::io;
#[test]

View File

@@ -1,4 +1,4 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::GarbageCollectionResult;
@@ -248,24 +248,15 @@ impl ManagedDirectory {
Ok(footer.crc() == crc)
}
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut managed_paths = self
/// List all managed files
pub fn list_managed_files(&self) -> HashSet<PathBuf> {
let managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
managed_paths.remove(*META_FILEPATH);
let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? {
damaged_files.insert(path);
}
}
Ok(damaged_files)
managed_paths
}
}
@@ -336,7 +327,6 @@ mod tests_mmap_specific {
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
use std::collections::HashSet;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -405,39 +395,4 @@ mod tests_mmap_specific {
}
assert!(!managed_directory.exists(test_path1).unwrap());
}
#[test]
fn test_checksum() -> crate::Result<()> {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let mmap_directory = MmapDirectory::open(&tempdir_path)?;
let managed_directory = ManagedDirectory::wrap(mmap_directory)?;
let mut write = managed_directory.open_write(test_path1)?;
write.write_all(&[0u8, 1u8])?;
write.terminate()?;
let mut write = managed_directory.open_write(test_path2)?;
write.write_all(&[3u8, 4u8, 5u8])?;
write.terminate()?;
let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path;
corrupted_path.push(test_path2);
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;
file.flush()?;
drop(file);
let damaged = managed_directory.list_damaged()?;
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
Ok(())
}
}
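Callers that relied on the removed `list_damaged` (and its deleted `test_checksum` test) can rebuild it on top of `list_managed_files`. A hedged sketch, assuming `validate_checksum` remains reachable from the call site and crate-internal imports for `ManagedDirectory` and `OpenReadError`:

```rust
use std::collections::HashSet;
use std::path::PathBuf;

// Sketch only: reconstructs the old `list_damaged` behavior externally.
// Note the removed helper also skipped META_FILEPATH before checking.
fn list_damaged(dir: &ManagedDirectory) -> Result<HashSet<PathBuf>, OpenReadError> {
    let mut damaged = HashSet::new();
    for path in dir.list_managed_files() {
        if !dir.validate_checksum(&path)? {
            damaged.insert(path);
        }
    }
    Ok(damaged)
}
```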

View File

@@ -11,7 +11,7 @@ use crate::directory::{AntiCallToken, FileHandle, OwnedBytes};
use crate::directory::{ArcBytes, WeakArcBytes};
use crate::directory::{TerminatingWrite, WritePtr};
use fs2::FileExt;
use memmap::Mmap;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use stable_deref_trait::StableDeref;
use std::convert::From;
@@ -53,7 +53,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
memmap2::Mmap::map(&file)
.map(Some)
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
}
@@ -485,13 +485,14 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use crate::indexer::LogMergePolicy;
use crate::Index;
use crate::ReloadPolicy;
use crate::{common::HasLen, indexer::LogMergePolicy};
use crate::{
schema::{Schema, SchemaBuilder, TEXT},
IndexSettings,
};
use common::HasLen;
#[test]
fn test_open_non_existent_path() {

View File

@@ -20,6 +20,9 @@ mod watch_event_router;
/// Errors specific to the directory module.
pub mod error;
mod composite_file;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};

View File

@@ -1,9 +1,10 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use common::HasLen;
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;

View File

@@ -1,4 +1,4 @@
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
@@ -85,11 +85,11 @@ pub trait DocSet: Send {
/// Returns the number of documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let mut count = 0u32;
let mut doc = self.doc();
while doc != TERMINATED {
if !delete_bitset.is_deleted(doc) {
if alive_bitset.is_alive(doc) {
count += 1u32;
}
doc = self.advance();
@@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet {
(**self).size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
(**self).count(delete_bitset)
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
(**self).count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
@@ -160,9 +160,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(delete_bitset)
unboxed.count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
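A one-line usage sketch for the renamed hook, assuming a `docset` and an `alive_bitset` in scope; `count` consumes the `DocSet`, advancing it to `TERMINATED` while testing each hit against `is_alive`:

```rust
// Sketch: count matching docs, skipping deleted ones (consumes `docset`).
let num_matching: u32 = docset.count(&alive_bitset);
```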

View File

@@ -0,0 +1,202 @@
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::ReadSerializedBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;
/// Write an alive `BitSet`,
///
/// where `alive_bitset` is the set of alive `DocId`s.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
alive_bitset.serialize(writer)?;
Ok(())
}
/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
num_alive_docs: usize,
bitset: ReadSerializedBitSet,
num_bytes: ByteCount,
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let mut alive_bitset_buffer = Vec::new();
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
Self::open(alive_bitset_bytes)
}
/// Opens an alive bitset given its serialized bytes.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let num_bytes = bytes.len();
let bitset = ReadSerializedBitSet::open(bytes);
AliveBitSet {
num_alive_docs: bitset.len(),
bitset,
num_bytes,
}
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
#[inline]
pub fn is_alive(&self, doc: DocId) -> bool {
self.bitset.contains(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
!self.is_alive(doc)
}
/// Iterate over the alive docids.
#[inline]
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter()
}
/// Returns the underlying bitset.
#[inline]
pub fn bitset(&self) -> &ReadSerializedBitSet {
&self.bitset
}
/// The number of alive docs.
pub fn num_alive_docs(&self) -> usize {
self.num_alive_docs
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.num_bytes
}
}
#[cfg(test)]
mod tests {
use super::AliveBitSet;
#[test]
fn test_alive_bitset_empty() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
assert!(!alive_bitset.is_deleted(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 10);
}
#[test]
fn test_alive_bitset() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert!(alive_bitset.is_alive(2));
assert!(alive_bitset.is_alive(3));
assert!(alive_bitset.is_alive(4));
assert!(alive_bitset.is_alive(5));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(7));
assert!(alive_bitset.is_alive(8));
assert!(alive_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 8);
}
#[test]
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
#[test]
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::AliveBitSet;
use rand::prelude::IteratorRandom;
use rand::thread_rng;
use test::Bencher;
fn get_alive() -> Vec<u32> {
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
for _ in 0..1_000_000 / 8 {
remove_rand(&mut data);
}
data
}
fn remove_rand(raw: &mut Vec<u32>) {
let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
raw.remove(i);
}
#[bench]
fn bench_alivebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alivebitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
#[bench]
fn bench_alivebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alivebitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
}
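Putting the new module together: build a full `BitSet`, remove the deleted docs, serialize it, and reopen it as an `AliveBitSet`. A sketch using the `common` and `ownedbytes` crates from the imports above, mirroring the `for_test_from_deleted_docs` helper:

```rust
use common::BitSet;
use ownedbytes::OwnedBytes;

// Sketch: round-trip a bitset with docs 1 and 9 deleted out of max_doc = 10.
let mut bitset = BitSet::with_max_value_and_full(10); // all docs alive
bitset.remove(1);
bitset.remove(9);

let mut buffer: Vec<u8> = Vec::new();
write_alive_bitset(&bitset, &mut buffer).unwrap();

let alive_bitset = AliveBitSet::open(OwnedBytes::new(buffer));
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert_eq!(alive_bitset.num_alive_docs(), 8);
assert_eq!(alive_bitset.iter_alive().count(), 8);
```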

View File

@@ -1,143 +0,0 @@
use crate::common::{BitSet, HasLen};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use std::io;
use std::io::Write;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut WritePtr,
) -> io::Result<()> {
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
if shift == 7 {
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
} else {
shift += 1;
}
}
if max_doc % 8 > 0 {
writer.write_all(&[byte])?;
}
Ok(())
}
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: OwnedBytes,
num_deleted: usize,
}
impl DeleteBitSet {
#[cfg(test)]
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
use std::path::Path;
assert!(docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value(max_doc);
for &doc in docs {
bitset.insert(doc);
}
let directory = RamDirectory::create();
let path = Path::new("dummydeletebitset");
let mut wrt = directory.open_write(path).unwrap();
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
wrt.terminate().unwrap();
let file = directory.open_read(path).unwrap();
Self::open(file).unwrap()
}
/// Opens a delete bitset given its file.
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
let bytes = file.read_bytes()?;
let num_deleted: usize = bytes
.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
Ok(DeleteBitSet {
data: bytes,
num_deleted,
})
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
pub fn is_alive(&self, doc: DocId) -> bool {
!self.is_deleted(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = self.data.as_slice()[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
/// The number of deleted docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.num_deleted
}
}
#[cfg(test)]
mod tests {
use super::DeleteBitSet;
use crate::common::HasLen;
#[test]
fn test_delete_bitset_empty() {
let delete_bitset = DeleteBitSet::for_test(&[], 10);
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 0);
}
#[test]
fn test_delete_bitset() {
let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
assert!(delete_bitset.is_alive(0));
assert!(delete_bitset.is_deleted(1));
assert!(delete_bitset.is_alive(2));
assert!(delete_bitset.is_alive(3));
assert!(delete_bitset.is_alive(4));
assert!(delete_bitset.is_alive(5));
assert!(delete_bitset.is_alive(6));
assert!(delete_bitset.is_alive(6));
assert!(delete_bitset.is_alive(7));
assert!(delete_bitset.is_alive(8));
assert!(delete_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 2);
}
}
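For reference, the removed format packed one delete flag per document, least-significant bit first within each byte, so lookup was pure arithmetic (`byte = doc / 8`, `bit = doc & 7`). A worked example of the legacy addressing:

```rust
// Worked example of the legacy layout deleted above.
fn legacy_is_deleted(data: &[u8], doc: u32) -> bool {
    let byte = data[(doc / 8) as usize];
    let shift = (doc & 7) as u8;
    byte & (1u8 << shift) != 0
}

// With docs 1 and 9 deleted out of max_doc = 10, the file is two bytes,
// each with bit 1 set: doc 1 -> byte 0, bit 1; doc 9 -> byte 1, bit 1.
let data = [0b0000_0010u8, 0b0000_0010u8];
assert!(legacy_is_deleted(&data, 1));
assert!(legacy_is_deleted(&data, 9));
assert!(!legacy_is_deleted(&data, 0));
```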

View File

@@ -23,9 +23,9 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/
pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
@@ -40,14 +40,14 @@ pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
use crate::schema::Value;
use crate::DocId;
use crate::{
chrono::{NaiveDateTime, Utc},
schema::Type,
};
use crate::{common, DocId};
mod alive_bitset;
mod bytes;
mod delete;
mod error;
mod facet_reader;
mod multivalued;
@@ -213,8 +213,7 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
@@ -222,6 +221,7 @@ mod tests {
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -588,7 +588,7 @@ mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use std::collections::HashMap;

View File

@@ -8,14 +8,22 @@ pub use self::writer::MultiValuedFastFieldWriter;
mod tests {
use crate::collector::TopDocs;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::Cardinality;
use crate::schema::Facet;
use crate::schema::IntOptions;
use crate::schema::Schema;
use crate::schema::INDEXED;
use crate::Document;
use crate::Index;
use crate::Term;
use chrono::Duration;
use futures::executor::block_on;
use proptest::prop_oneof;
use proptest::proptest;
use proptest::strategy::Strategy;
use test_env_log::test;
#[test]
fn test_multivalued_u64() {
@@ -225,6 +233,111 @@ mod tests {
multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
fn test_multivalued_no_panic(ops: &[IndexingOp]) {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
match id % 3 {
0 => {
index_writer.add_document(doc!());
}
1 => {
let mut doc = Document::new();
for _ in 0..5001 {
doc.add_u64(field, id as u64);
}
index_writer.add_document(doc);
}
_ => {
let mut doc = Document::new();
doc.add_u64(field, id as u64);
index_writer.add_document(doc);
}
};
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(field, id as u64));
}
IndexingOp::Commit => {
index_writer.commit().unwrap();
}
IndexingOp::Merge => {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}
assert!(index_writer.commit().is_ok());
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if !segment_ids.is_empty() {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}
}
#[derive(Debug, Clone, Copy)]
enum IndexingOp {
AddDoc { id: u32 },
DeleteDoc { id: u32 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }),
(0u32..2u32).prop_map(|_| IndexingOp::Commit),
(0u32..1u32).prop_map(|_| IndexingOp::Merge),
]
}
proptest! {
#[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
test_multivalued_no_panic(&ops[..]);
}
}
#[test]
fn test_multivalued_proptest_off_by_one_bug_1151() {
use IndexingOp::*;
let ops = [
AddDoc { id: 3 },
AddDoc { id: 1 },
AddDoc { id: 3 },
Commit,
Merge,
];
test_multivalued_no_panic(&ops[..]);
}
#[test]
#[ignore]
fn test_many_facets() {

View File

@@ -1,6 +1,5 @@
use super::FastValue;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
@@ -8,6 +7,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};

View File

@@ -1,8 +1,8 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use common::BinarySerializable;
use common::CountingWriter;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
@@ -105,9 +105,7 @@ impl CompositeFastFieldSerializer {
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations
.iter()
.find(|estimation| estimation.0 == f32::NAN)
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",

View File

@@ -1,12 +1,12 @@
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use common;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;

View File

@@ -1,5 +1,5 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::schema::Field;

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeWrite;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io;

View File

@@ -1,4 +1,8 @@
use crate::schema;
use crate::Index;
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::Order;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
@@ -35,7 +39,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut doc_set: Vec<u64> = Vec::new();
let mut doc_id = 0u64;
for iteration in 0..500 {
for iteration in 0..get_num_iterations() {
dbg!(iteration);
let num_docs: usize = rng.gen_range(0..4);
if !doc_set.is_empty() {
@@ -56,16 +60,37 @@ fn test_functional_store() -> crate::Result<()> {
Ok(())
}
fn get_num_iterations() -> usize {
std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
.map(|str| str.parse().unwrap())
.unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
});
let index = index_builder.create_from_tempdir().unwrap();
let reader = index.reader()?;
let mut rng = thread_rng();
@@ -75,7 +100,7 @@ fn test_functional_indexing() -> crate::Result<()> {
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
@@ -98,6 +123,84 @@ fn test_functional_indexing() -> crate::Result<()> {
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc);
}
}
Ok(())
}
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.";
fn get_text() -> String {
use rand::seq::SliceRandom;
let mut rng = thread_rng();
let tokens: Vec<_> = LOREM.split(' ').collect();
let random_val = rng.gen_range(0..20);
(0..random_val)
.map(|_| tokens.choose(&mut rng).unwrap())
.cloned()
.collect::<Vec<_>>()
.join(" ")
}
#[test]
#[ignore]
fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc);
}
}

View File

@@ -2,23 +2,23 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//!
use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use super::SegmentWriter;
use crate::{
schema::{Field, Schema},
DocId, IndexSortByField, Order, TantivyError,
DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError,
};
use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to (old doc_id, segment ordinal).
#[derive(Clone)]
pub(crate) struct SegmentDocidMapping<'a> {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
pub(crate) struct SegmentDocidMapping {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
}
impl<'a> SegmentDocidMapping<'a> {
impl SegmentDocidMapping {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
) -> Self {
Self {
@@ -26,7 +26,7 @@ impl<'a> SegmentDocidMapping<'a> {
is_trivial,
}
}
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentOrdinal)> {
self.new_doc_id_to_old_and_segment.iter()
}
pub(crate) fn len(&self) -> usize {
@@ -40,15 +40,15 @@ impl<'a> SegmentDocidMapping<'a> {
self.is_trivial
}
}
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
type Output = (DocId, SegmentReaderWithOrdinal<'a>);
impl Index<usize> for SegmentDocidMapping {
type Output = (DocId, SegmentOrdinal);
fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
type Item = (DocId, SegmentReaderWithOrdinal<'a>);
impl IntoIterator for SegmentDocidMapping {
type Item = (DocId, SegmentOrdinal);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {

View File

@@ -1,7 +1,6 @@
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use crate::common::BitSet;
use crate::core::Index;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -12,7 +11,7 @@ use crate::directory::TerminatingWrite;
use crate::directory::{DirectoryLock, GarbageCollectionResult};
use crate::docset::{DocSet, TERMINATED};
use crate::error::TantivyError;
use crate::fastfield::write_delete_bitset;
use crate::fastfield::write_alive_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::operation::DeleteOperation;
@@ -24,14 +23,18 @@ use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Opstamp;
use common::BitSet;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
use smallvec::smallvec;
use smallvec::SmallVec;
use wasm_mt_pool::pool_exec;
use wasm_mt::prelude::*;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use wasm_mt_pool::prelude::*;
use std::thread;
use std::thread::JoinHandle;
@@ -75,7 +78,7 @@ pub struct IndexWriter {
heap_size_in_bytes_per_thread: usize,
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
workers_join_handle: Vec<JoinHandle<Result<JsValue, JsValue>>>,
operation_receiver: OperationReceiver,
operation_sender: OperationSender,
@@ -90,10 +93,12 @@ pub struct IndexWriter {
stamper: Stamper,
committed_opstamp: Opstamp,
worker_pool: wasm_mt_pool::ThreadPool,
}
fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
alive_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
@@ -114,7 +119,7 @@ fn compute_deleted_bitset(
let mut doc_matching_deleted_term = docset.doc();
while doc_matching_deleted_term != TERMINATED {
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
delete_bitset.insert(doc_matching_deleted_term);
alive_bitset.remove(doc_matching_deleted_term);
might_have_changed = true;
}
doc_matching_deleted_term = docset.advance();
@@ -141,7 +146,7 @@ pub(crate) fn advance_deletes(
return Ok(());
}
if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
if segment_entry.alive_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
// There has been no `DeleteOperation` between the segment status and `target_opstamp`.
return Ok(());
}
@@ -149,15 +154,15 @@ pub(crate) fn advance_deletes(
let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc();
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
None => BitSet::with_max_value(max_doc),
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
Some(previous_alive_bitset) => (*previous_alive_bitset).clone(),
None => BitSet::with_max_value_and_full(max_doc),
};
let num_deleted_docs_before = segment.meta().num_deleted_docs();
compute_deleted_bitset(
&mut delete_bitset,
&mut alive_bitset,
&segment_reader,
segment_entry.delete_cursor(),
&DocToOpstampMapping::None,
@@ -167,20 +172,21 @@ pub(crate) fn advance_deletes(
// TODO optimize
// It should be possible to do something smarter by manipulating the bitsets directly
// to compute this intersection.
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
for doc in 0u32..max_doc {
if seg_delete_bitset.is_deleted(doc) {
delete_bitset.insert(doc);
if seg_alive_bitset.is_deleted(doc) {
alive_bitset.remove(doc);
}
}
}
let num_deleted_docs: u32 = delete_bitset.len() as u32;
let num_alive_docs: u32 = alive_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
if num_deleted_docs > num_deleted_docs_before {
// There are new deletes. We need to write a new delete file.
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
write_alive_bitset(&alive_bitset, &mut delete_file)?;
delete_file.terminate()?;
}
@@ -226,13 +232,12 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc);
let delete_bitset_opt =
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt);
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
block_on(segment_updater.schedule_add_segment(segment_entry))?;
Ok(true)
}
@@ -259,7 +264,7 @@ fn apply_deletes(
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value(max_doc);
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
@@ -318,6 +323,7 @@ impl IndexWriter {
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let worker_pool = block_on(wasm_mt_pool::ThreadPool::new(num_threads, crate::PKG_JS).and_init()).unwrap();
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
@@ -338,6 +344,7 @@ impl IndexWriter {
stamper,
worker_id: 0,
worker_pool,
};
index_writer.start_workers()?;
Ok(index_writer)
@@ -348,6 +355,11 @@ impl IndexWriter {
self.operation_sender = sender;
}
/// Accessor to the index.
pub fn index(&self) -> &Index {
&self.index
}
/// If there are some merging threads, blocks until they all finish their work and
/// then drop the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> crate::Result<()> {
@@ -406,9 +418,8 @@ impl IndexWriter {
let mem_budget = self.heap_size_in_bytes_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<crate::Result<()>> = thread::Builder::new()
.name(format!("thrd-tantivy-index{}", self.worker_id))
.spawn(move || {
let join_handle: JoinHandle<crate::Result<_>> = pool_exec!(self.worker_pool,
move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
@@ -849,7 +860,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 2);
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
assert!(index_writer.commit().is_ok());
@@ -857,7 +868,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
@@ -869,7 +880,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
assert_eq!(after_delete_opstamp, previous_delete_opstamp);
@@ -1361,6 +1372,7 @@ mod tests {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
@@ -1368,6 +1380,7 @@ mod tests {
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
@@ -1393,7 +1406,7 @@ mod tests {
fn test_operation_strategy(
ops: &[IndexingOp],
sort_index: bool,
force_merge: bool,
force_end_merge: bool,
) -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
@@ -1435,6 +1448,8 @@ mod tests {
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
@@ -1448,12 +1463,21 @@ mod tests {
IndexingOp::Commit => {
index_writer.commit()?;
}
IndexingOp::Merge => {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
if force_merge {
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
@@ -1500,7 +1524,7 @@ mod tests {
for segment_reader in searcher.segment_readers().iter() {
let store_reader = segment_reader.get_store_reader().unwrap();
// test store iterator
for doc in store_reader.iter(segment_reader.delete_bitset()) {
for doc in store_reader.iter(segment_reader.alive_bitset()) {
let id = doc
.unwrap()
.get_first(id_field)
@@ -1626,7 +1650,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_deleted_docs(), 1);
assert_eq!(segment_reader.num_docs(), 1);
Ok(())
}
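The alive-semantics inversion runs through the whole writer path: bitsets start full, delete matches call `remove`, and the delete count falls out as `max_doc - alive.len()`. A condensed sketch of the accounting in `advance_deletes`, using the `common::BitSet` API from the imports above:

```rust
use common::BitSet;

// Condensed sketch of the alive-set accounting in `advance_deletes`.
let max_doc = 100u32;
let mut alive_bitset = BitSet::with_max_value_and_full(max_doc); // starts full

// `compute_deleted_bitset` now removes matched docs instead of inserting them.
alive_bitset.remove(3);
alive_bitset.remove(42);

let num_alive_docs = alive_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
assert_eq!(num_deleted_docs, 2);
```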

View File

@@ -1,10 +1,10 @@
use crate::error::DataCorruption;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldDataAccess;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldStats;
use crate::fastfield::MultiValueLength;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
@@ -19,9 +19,8 @@ use crate::schema::{Field, Schema};
use crate::store::StoreWriter;
use crate::termdict::TermMerger;
use crate::termdict::TermOrdinal;
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::{common::HasLen, fastfield::MultiValueLength};
use crate::{common::MAX_DOC_LIMIT, IndexSettings};
use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field};
use crate::{core::SegmentReader, Order};
use crate::{
@@ -36,6 +35,11 @@ use std::collections::HashMap;
use std::sync::Arc;
use tantivy_bitpacker::minmax;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than `(1 << 31) - 1` documents.
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
@@ -63,58 +67,33 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::R
.sum::<u64>())
}
/// `ReaderWithOrdinal` is used to be able to easier associate
/// data with a `SegmentReader`. The ordinal is supposed to be
/// used as an index access.
///
/// The ordinal identifies the position within `Merger` readers.
#[derive(Clone, Copy)]
pub(crate) struct SegmentReaderWithOrdinal<'a> {
pub reader: &'a SegmentReader,
pub ordinal: SegmentOrdinal,
}
impl<'a> From<(usize, &'a SegmentReader)> for SegmentReaderWithOrdinal<'a> {
fn from(data: (usize, &'a SegmentReader)) -> Self {
SegmentReaderWithOrdinal {
reader: data.1,
ordinal: data.0 as u32,
}
}
}
pub struct IndexMerger {
index_settings: IndexSettings,
schema: Schema,
readers: Vec<SegmentReader>,
pub(crate) readers: Vec<SegmentReader>,
max_doc: u32,
}
fn compute_min_max_val(
u64_reader: &impl FastFieldReader<u64>,
max_doc: DocId,
delete_bitset_opt: Option<&DeleteBitSet>,
segment_reader: &SegmentReader,
) -> Option<(u64, u64)> {
if max_doc == 0 {
None
} else {
match delete_bitset_opt {
Some(delete_bitset) => {
// some deleted documents,
// we need to recompute the max / min
minmax(
(0..max_doc)
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
.map(|doc_id| u64_reader.get(doc_id)),
)
}
None => {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u64_reader.min_value(), u64_reader.max_value()))
}
}
if segment_reader.max_doc() == 0 {
return None;
}
if segment_reader.alive_bitset().is_none() {
// no deleted documents,
// we can use the previous min_val, max_val.
return Some((u64_reader.min_value(), u64_reader.max_value()));
}
// some deleted documents,
// we need to recompute the max / min
minmax(
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get(doc_id)),
)
}
struct TermOrdinalMapping {
@@ -144,7 +123,7 @@ impl TermOrdinalMapping {
.iter()
.flat_map(|term_ordinals| term_ordinals.iter().cloned().max())
.max()
.unwrap_or_else(TermOrdinal::default)
.unwrap_or_default()
}
}
@@ -246,8 +225,8 @@ impl IndexMerger {
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let fieldnorms_reader = &fieldnorms_readers[reader_with_ordinal.ordinal as usize];
for (doc_id, reader_ordinal) in doc_id_mapping.iter() {
let fieldnorms_reader = &fieldnorms_readers[*reader_ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
@@ -320,7 +299,7 @@ impl IndexMerger {
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
compute_min_max_val(&u64_reader, reader)
})
.flatten()
.reduce(|a, b| {
@@ -346,25 +325,25 @@ impl IndexMerger {
};
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
doc_id_mapping: &'a SegmentDocidMapping,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
let (doc_id, reader_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_ordinal as usize].get(doc_id)
}
}
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter1 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
let iter1 = doc_id_mapping.iter().map(|(doc_id, reader_ordinal)| {
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
fast_field_reader.get(*doc_id)
});
let iter2 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
let iter2 = doc_id_mapping.iter().map(|(doc_id, reader_ordinal)| {
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
@@ -384,9 +363,10 @@ impl IndexMerger {
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<bool> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
let everything_is_in_order = reader_and_field_accessors
let everything_is_in_order = reader_ordinal_and_field_accessors
.into_iter()
.map(|reader| reader.1)
.tuple_windows()
@@ -412,24 +392,21 @@ impl IndexMerger {
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
&'a self,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<
Vec<(
SegmentReaderWithOrdinal<'a>,
impl FastFieldReader<u64> + Clone,
)>,
> {
let reader_and_field_accessors = self
) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> {
let reader_ordinal_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(Into::into)
.map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
let value_accessor =
Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
Ok((reader_with_ordinal, value_accessor))
.map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal)
.map(|reader_ordinal: SegmentOrdinal| {
let value_accessor = Self::get_sort_field_accessor(
&self.readers[reader_ordinal as usize],
sort_by_field,
)?;
Ok((reader_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?;
Ok(reader_and_field_accessors)
Ok(reader_ordinal_and_field_accessors)
}
/// Generates the doc_id mapping where position in the vec=new
@@ -440,50 +417,54 @@ impl IndexMerger {
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<SegmentDocidMapping> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair =
reader_and_field_accessors
reader_ordinal_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
let reader = &self.readers[reader_and_field_accessor.0 as usize];
reader.doc_ids_alive().map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
});
let total_num_new_docs = self
.readers
.iter()
.map(|reader| reader.num_docs() as usize)
.sum();
let mut sorted_doc_ids = Vec::with_capacity(total_num_new_docs);
// create iterator tuple of (old doc_id, segment ordinal) in order of the new doc_ids
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get(a.0);
let val2 = b.2.get(b.0);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
val1 > val2
}
})
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id))
.collect::<Vec<_>>();
sorted_doc_ids.extend(
doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get(a.0);
let val2 = b.2.get(b.0);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
val1 > val2
}
})
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)),
);
Ok(SegmentDocidMapping::new(sorted_doc_ids, false))
}
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
// `MultiValuedFastFieldReader`
//
// Important: reader_and_field_accessors needs
// to have the same order as self.readers, since the `SegmentOrdinal`
// in the doc id mapping is used to index the reader_and_field_accessors vec.
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
@@ -495,25 +476,24 @@ impl IndexMerger {
//
// This is required by the bitpacker, as it needs to know
// what should be the bit length use for bitpacking.
let mut idx_num_vals = 0;
let mut num_docs = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(delete_bitset) = reader.delete_bitset() {
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = u64s_reader.get_len(doc) as u64;
total_num_vals += num_vals;
}
if let Some(alive_bitset) = reader.alive_bitset() {
num_docs += alive_bitset.num_alive_docs() as u64;
for doc in reader.doc_ids_alive() {
let num_vals = u64s_reader.get_len(doc) as u64;
total_num_vals += num_vals;
}
} else {
idx_num_vals += reader.max_doc() as u64;
num_docs += reader.max_doc() as u64;
total_num_vals += u64s_reader.get_total_len();
}
}
let stats = FastFieldStats {
max_value: total_num_vals,
num_vals: idx_num_vals,
// The fastfield offset index contains (num_docs + 1) values.
num_vals: num_docs + 1,
min_value: 0,
};
// We can now create our `idx` serializer, and in a second pass,
@@ -524,10 +504,10 @@ impl IndexMerger {
// access on the fly or 2. change the codec api to make random access optional, but
// both approaches have major drawbacks.
let mut offsets = vec![];
let mut offsets = Vec::with_capacity(doc_id_mapping.len());
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping.iter() {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
let reader = &reader_and_field_accessors[*reader as usize].1;
offsets.push(offset);
offset += reader.get_len(*doc_id) as u64;
}
@@ -549,7 +529,7 @@ impl IndexMerger {
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Vec<u64>> {
let reader_and_field_accessors = self.readers.iter().map(|reader|{
let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
.typed_fast_field_multi_reader(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
@@ -560,7 +540,7 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&reader_and_field_accessors,
&reader_ordinal_and_field_accessors,
)
}
@@ -599,11 +579,11 @@ impl IndexMerger {
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100);
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(reader_with_ordinal.ordinal as usize);
term_ordinal_mappings.get_segment(*reader_ordinal as usize);
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
let ff_reader = &fast_field_reader[*reader_ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
@@ -619,21 +599,25 @@ impl IndexMerger {
/// Creates a mapping as if the segments were simply stacked. This is helpful to share
/// code paths between index sorting and regular merging.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocidMapping> {
let mapping: Vec<_> = self
let total_num_new_docs = self
.readers
.iter()
.enumerate()
.map(|(ordinal, reader)| {
let reader_with_ordinal = SegmentReaderWithOrdinal {
ordinal: ordinal as u32,
reader,
};
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_with_ordinal))
})
.flatten()
.collect();
.map(|reader| reader.num_docs() as usize)
.sum();
let mut mapping = Vec::with_capacity(total_num_new_docs);
mapping.extend(
self.readers
.iter()
.enumerate()
.map(|(reader_ordinal, reader)| {
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
})
.flatten(),
);
Ok(SegmentDocidMapping::new(mapping, true))
}
fn write_multi_fast_field(
@@ -697,7 +681,7 @@ impl IndexMerger {
};
struct SortedDocidMultiValueAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
doc_id_mapping: &'a SegmentDocidMapping,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>,
}
@@ -716,13 +700,11 @@ impl IndexMerger {
let num_pos_covered_until_now = self.offsets[new_docid];
let pos_in_values = pos - num_pos_covered_until_now;
let (old_doc_id, reader_with_ordinal) = self.doc_id_mapping[new_docid as usize];
let num_vals = self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_len(old_doc_id);
let (old_doc_id, reader_ordinal) = self.doc_id_mapping[new_docid as usize];
let num_vals = self.fast_field_readers[reader_ordinal as usize].get_len(old_doc_id);
assert!(num_vals >= pos_in_values);
let mut vals = vec![];
self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_vals(old_doc_id, &mut vals);
self.fast_field_readers[reader_ordinal as usize].get_vals(old_doc_id, &mut vals);
vals[pos_in_values as usize]
}
@@ -734,8 +716,8 @@ impl IndexMerger {
};
let iter1 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
.map(|(doc_id, reader_ordinal)| {
let ff_reader = &ff_readers[*reader_ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
@@ -743,8 +725,8 @@ impl IndexMerger {
.flatten();
let iter2 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
.map(|(doc_id, reader_ordinal)| {
let ff_reader = &ff_readers[*reader_ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
@@ -786,8 +768,8 @@ impl IndexMerger {
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1);
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let bytes_reader = &reader_and_field_accessors[reader_with_ordinal.ordinal as usize].1;
for (doc_id, reader_ordinal) in doc_id_mapping.iter() {
let bytes_reader = &reader_and_field_accessors[*reader_ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
@@ -841,8 +823,8 @@ impl IndexMerger {
segment_local_map
})
.collect();
for (new_doc_id, (old_doc_id, segment_and_ordinal)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[segment_and_ordinal.ordinal as usize];
for (new_doc_id, (old_doc_id, segment_ordinal)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[*segment_ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
@@ -889,9 +871,9 @@ impl IndexMerger {
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings = inverted_index
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
let delete_bitset_opt = segment_reader.delete_bitset();
let doc_freq = if let Some(delete_bitset) = delete_bitset_opt {
segment_postings.doc_freq_given_deletes(delete_bitset)
let alive_bitset_opt = segment_reader.alive_bitset();
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
segment_postings.doc_freq_given_deletes(alive_bitset)
} else {
segment_postings.doc_freq()
};
@@ -958,12 +940,13 @@ impl IndexMerger {
}
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
field_serializer.write_doc(*doc_id, *term_freq, positions);
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// closing the term.
field_serializer.close_term()?;
}
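The new delta_computer step turns absolute in-document positions into gaps before they reach the serializer. A minimal sketch of that delta encoding (a hypothetical compute_delta, not tantivy's DeltaComputer):

// Positions within a document are non-decreasing, so storing gaps
// keeps the values small and cheap to bitpack.
fn compute_delta(positions: &[u32]) -> Vec<u32> {
    let mut prev = 0;
    positions
        .iter()
        .map(|&pos| {
            let delta = pos - prev;
            prev = pos;
            delta
        })
        .collect()
}

fn main() {
    assert_eq!(compute_delta(&[1, 3, 7]), vec![1, 2, 4]);
}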
@@ -1010,11 +993,11 @@ impl IndexMerger {
let mut document_iterators: Vec<_> = store_readers
.iter()
.enumerate()
.map(|(i, store)| store.iter_raw(self.readers[i].delete_bitset()))
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
.collect();
if !doc_id_mapping.is_trivial() {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() {
let doc_bytes_it = &mut document_iterators[*reader_ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
@@ -1029,7 +1012,7 @@ impl IndexMerger {
} else {
for reader in &self.readers {
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0
if reader.has_deletes()
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
@@ -1046,7 +1029,7 @@ impl IndexMerger {
|| store_reader.block_checkpoints().take(7).count() < 6
|| store_reader.compressor() != store_writer.compressor()
{
for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
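The stacking decision above boils down to a three-part predicate. A sketch with a hypothetical can_stack helper mirroring the condition in the merged code:

// Stack a segment's raw store bytes only if nothing forces a re-encode:
// no deletes, at least 6 of the first 7 checkpoints present (enough full
// blocks), and the same compressor on both sides.
fn can_stack(has_deletes: bool, full_blocks: usize, same_compressor: bool) -> bool {
    !has_deletes && full_blocks >= 6 && same_compressor
}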
@@ -2074,4 +2057,11 @@ mod tests {
Ok(())
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
}
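The two assertions pin super::MAX_DOC_LIMIT down to exactly 2^31: its predecessor must still fit in a non-negative i32, while the limit itself must cast negative. A standalone sketch, assuming the constant is 1 << 31:

const MAX_DOC_LIMIT: u32 = 1 << 31;

fn main() {
    // The largest valid doc id fits in a non-negative i32...
    assert!(((MAX_DOC_LIMIT - 1) as i32) >= 0);
    // ...while the limit itself wraps negative, so overflows show up as a sign flip.
    assert!((MAX_DOC_LIMIT as i32) < 0);
}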

View File

@@ -1,6 +1,7 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::fastfield::{AliveBitSet, FastFieldReader};
use crate::schema::IndexRecordOption;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
@@ -16,7 +17,7 @@ mod tests {
schema::{self, BytesOptions},
DocAddress,
};
use crate::{IndexSettings, Term};
use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on;
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -104,9 +105,11 @@ mod tests {
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
);
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64),
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
);
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
);
assert!(index_writer.commit().is_ok());
@@ -118,7 +121,7 @@ mod tests {
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
@@ -243,6 +246,36 @@ mod tests {
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
@@ -260,6 +293,69 @@ mod tests {
}
}
#[test]
fn test_merge_unsorted_index() {
let index = create_test_index(
Some(IndexSettings {
..Default::default()
}),
false,
);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
@@ -314,7 +410,7 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
@@ -339,6 +435,34 @@ mod tests {
assert_eq!(do_search("biggest"), vec![5]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
@@ -422,8 +546,9 @@ mod bench_sorted_index_merge {
let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
let u64_reader: DynamicFastFieldReader<u64> = reader.reader
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{
let reader = &merger.readers[*ordinal as usize];
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");

View File

@@ -1,7 +1,7 @@
use crate::common::BitSet;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::indexer::delete_queue::DeleteCursor;
use common::BitSet;
use std::fmt;
/// A segment entry describes the state of
@@ -9,18 +9,16 @@ use std::fmt;
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `state` expresses whether the segment is already in the
/// middle of a merge
/// - `delete_bitset` is a bitset describing
/// documents that were deleted during the commit
/// - `alive_bitset` is a bitset describing
/// documents that were alive during the commit
/// itself.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the `delete_bitset`.
/// in the .del file or in the `alive_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
delete_bitset: Option<BitSet>,
alive_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
}
@@ -29,11 +27,11 @@ impl SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
alive_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
delete_bitset,
alive_bitset,
delete_cursor,
}
}
@@ -41,8 +39,8 @@ impl SegmentEntry {
/// Return a reference to the segment entry alive bitset.
///
/// `DocId`s in this bitset are flagged as alive.
pub fn delete_bitset(&self) -> Option<&BitSet> {
self.delete_bitset.as_ref()
pub fn alive_bitset(&self) -> Option<&BitSet> {
self.alive_bitset.as_ref()
}
/// Set the `SegmentMeta` for this segment.

View File

@@ -11,6 +11,8 @@
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
#![feature(async_closure)]
//! # `tantivy`
//!
//! Tantivy is a search engine library.
@@ -126,6 +128,8 @@ mod macros;
pub use crate::error::TantivyError;
pub use chrono;
pub const PKG_JS: &'static str = "./pkg/pool_exec.js"; // path to `wasm-bindgen`'s JS binding
/// Tantivy result.
///
/// Within tantivy, please avoid importing `Result` using `use crate::Result`
@@ -135,7 +139,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;
@@ -163,8 +166,6 @@ pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::HasLen;
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
@@ -178,6 +179,8 @@ pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt;
use once_cell::sync::Lazy;
@@ -293,7 +296,7 @@ pub struct DocAddress {
}
#[cfg(test)]
mod tests {
pub mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
@@ -304,11 +307,18 @@ mod tests {
use crate::Index;
use crate::Postings;
use crate::ReloadPolicy;
use common::{BinarySerializable, FixedSize};
use rand::distributions::Bernoulli;
use rand::distributions::Uniform;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
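fixed_size_test serializes a default value and checks the byte count against the type's declared SIZE_IN_BYTES. A self-contained sketch, with local stand-ins for the common crate's traits and a hypothetical fixed-size type:

use std::io::{self, Write};

trait BinarySerializable {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
}
trait FixedSize {
    const SIZE_IN_BYTES: usize;
}

#[derive(Default)]
struct Checkpoint {
    start_doc: u32,
    end_doc: u32,
}

impl BinarySerializable for Checkpoint {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.start_doc.to_le_bytes())?;
        writer.write_all(&self.end_doc.to_le_bytes())
    }
}
impl FixedSize for Checkpoint {
    const SIZE_IN_BYTES: usize = 8; // two u32 fields
}

fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
    let mut buffer = Vec::new();
    O::default().serialize(&mut buffer).unwrap();
    assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}

fn main() {
    fixed_size_test::<Checkpoint>();
}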
/// Checks if left and right are close to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
@@ -993,8 +1003,24 @@ mod tests {
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let mut builder = Schema::builder();
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"));
writer.add_document(doc!(body => "boo"));
}
writer.commit()?;
assert!(index.validate_checksum()?.is_empty());
// delete few docs
writer.delete_term(Term::from_field_text(body, "foo"));
writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
let _ = futures::executor::block_on(writer.merge(&segment_ids));
assert!(index.validate_checksum()?.is_empty());
Ok(())
}

View File

@@ -1,9 +1,9 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{BlockDecoder, VIntDecoder};
use common::{BinarySerializable, VInt};
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.

View File

@@ -1,7 +1,7 @@
use crate::common::{BinarySerializable, CountingWriter, VInt};
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::BlockEncoder;
use crate::postings::compression::VIntEncoder;
use common::{BinarySerializable, CountingWriter, VInt};
use std::io::{self, Write};
/// The PositionSerializer is in charge of serializing all of the positions

View File

@@ -1,241 +1,109 @@
use std::ops::Range;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::AlignedBuffer;
unsafe fn binary_search_step(ptr: *const u32, target: u32, half_size: isize) -> *const u32 {
let mid = ptr.offset(half_size);
if *mid < target {
mid.offset(1)
} else {
ptr
}
}
/// This module defines the logic used to search for a doc in a given
/// block (of at most 128 docs).
/// Search the first index containing an element greater or equal to
/// the target.
///
/// Searching within a block is a hotspot when running intersections,
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
const MASK1: i32 = 78;
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the block.
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actually contains 4 `u32`s.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = (el * 2 + 1) << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}
/// This `linear search` browses exhaustively through the array,
/// as an early exit would be very difficult to predict.
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// Coupled with `exponential search`, this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return begin..pivot;
}
begin = pivot;
}
begin..end
}
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let range = exponential_search(block_docs, target);
range.start + linear_search(&block_docs[range], target)
}
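A property worth keeping in mind: galloping must agree with the trivial take_while count for any sorted block and in-range target. A test-style sketch against the functions above:

#[test]
fn galloping_matches_linear_count() {
    let block: Vec<u32> = (0..128u32).map(|i| i * 3).collect();
    for target in [0u32, 1, 47, 300, 381] {
        let expected = block.iter().take_while(|&&v| v < target).count();
        assert_eq!(galloping(&block, target), expected);
    }
}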
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
Sse2,
Scalar,
}
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// The `start` argument is just used to hint that the response is
/// greater than beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array len is > start.
/// The block is sorted
/// The target is assumed greater or equal to `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts by an exponential search, and
/// then operates a linear search in the result subarray.
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the block regardless of whether the block is full or not.
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
if self == BlockSearcher::Sse2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
galloping(&block_docs.0[..], target)
}
}
impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::Sse2;
}
}
BlockSearcher::Scalar
/// Branchless binary search over a full block of `COMPRESSION_BLOCK_SIZE` sorted `u32`s,
/// returning the first index whose element is greater or equal to the target.
///
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let start_ptr: *const u32 = &arr[0] as *const u32;
unsafe {
let mut ptr = start_ptr;
ptr = binary_search_step(ptr, target, 63);
ptr = binary_search_step(ptr, target, 31);
ptr = binary_search_step(ptr, target, 15);
ptr = binary_search_step(ptr, target, 7);
ptr = binary_search_step(ptr, target, 3);
ptr = binary_search_step(ptr, target, 1);
let extra = if *ptr < target { 1 } else { 0 };
(ptr.offset_from(start_ptr) as usize) + extra
}
}
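The fixed step sequence 63, 31, 15, 7, 3, 1 halves a 128-slot block down to a single candidate, and each step's `if` typically compiles to a conditional move rather than a branch, hence "branchless". A usage sketch, assuming COMPRESSION_BLOCK_SIZE is 128 (BitPacker4x's block length):

fn usage_sketch() {
    // The block must be full; unused slots are padded high so that
    // any in-range target stays <= the last element.
    let mut block = [u32::MAX; 128];
    for i in 0..100 {
        block[i] = (i as u32) * 2;
    }
    // The first index holding a value >= 13 is 7 (block[7] == 14).
    assert_eq!(branchless_binary_search(&block, 13), 7);
}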
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use super::branchless_binary_search;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
3..7
);
}
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
cursor
);
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use proptest::prelude::*;
use std::collections::HashSet;
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}
fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
fn util_test_search_in_block(block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(cursor < COMPRESSION_BLOCK_SIZE);
assert!(block[cursor] >= target);
if cursor > 0 {
assert!(block[cursor - 1] < target);
}
assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
}
fn util_test_search_in_block_all(block: &[u32]) {
let mut targets = HashSet::new();
targets.insert(0);
for &val in block {
if val > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block, target);
}
}
#[test]
fn test_search_in_block_scalar() {
test_search_in_block_util(BlockSearcher::Scalar);
fn test_search_in_branchless_binary_search() {
let v: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect();
util_test_search_in_block_all(&v[..]);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::Sse2);
fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
let mut el = 0;
for i in 0..COMPRESSION_BLOCK_SIZE {
el += deltas[i];
deltas[i] = el;
}
deltas
})
}
proptest! {
#[test]
fn test_proptest_branchless_binary_search(block in monotonous_block()) {
util_test_search_in_block_all(&block[..]);
}
}
}

View File

@@ -1,16 +1,14 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
use common::{BinarySerializable, VInt};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
it.next().map(|first| it.fold(first, Score::max))
@@ -209,9 +207,9 @@ impl BlockSegmentPostings {
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_aligned()
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
@@ -349,7 +347,6 @@ impl BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::BlockSegmentPostings;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -360,6 +357,7 @@ mod tests {
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use common::HasLen;
#[test]
fn test_empty_segment_postings() {

View File

@@ -1,5 +1,5 @@
use crate::common::FixedSize;
use bitpacking::{BitPacker, BitPacker4x};
use common::FixedSize;
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES;
@@ -49,16 +49,10 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is align on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
#[derive(Clone)]
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
#[derive(Clone)]
pub struct BlockDecoder {
bitpacker: BitPacker4x,
output: AlignedBuffer,
output: [u32; COMPRESSION_BLOCK_SIZE],
pub output_len: usize,
}
@@ -72,7 +66,7 @@ impl BlockDecoder {
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
bitpacker: BitPacker4x::new(),
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
output: [val; COMPRESSION_BLOCK_SIZE],
output_len: 0,
}
}
@@ -85,28 +79,28 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(compressed_data, &mut self.output.0, num_bits)
.decompress(compressed_data, &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output.0[..self.output_len]
&self.output[..self.output_len]
}
#[inline]
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output.0[idx]
self.output[idx]
}
}
@@ -190,8 +184,8 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted(
@@ -201,12 +195,12 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output);
self.output_len = num_els;
}
}
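The decoder above delegates to the bitpacking crate's BitPacker4x. A minimal roundtrip sketch using that crate's public API:

use bitpacking::{BitPacker, BitPacker4x};

fn main() {
    let original: Vec<u32> = (0..BitPacker4x::BLOCK_LEN as u32).map(|i| i * 7 % 100).collect();
    let bitpacker = BitPacker4x::new();
    let num_bits = bitpacker.num_bits(&original);
    // 4 bytes per u32 is always enough room for one packed block.
    let mut compressed = vec![0u8; BitPacker4x::BLOCK_LEN * 4];
    let written = bitpacker.compress(&original, &mut compressed[..], num_bits);
    let mut decompressed = vec![0u32; BitPacker4x::BLOCK_LEN];
    bitpacker.decompress(&compressed[..written], &mut decompressed[..], num_bits);
    assert_eq!(original, decompressed);
}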

View File

@@ -3,6 +3,9 @@ Postings module (also called inverted index)
*/
mod block_search;
pub(crate) use self::block_search::branchless_binary_search;
mod block_segment_postings;
pub(crate) mod compression;
mod postings;
@@ -14,7 +17,6 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub use self::block_segment_postings::BlockSegmentPostings;
pub use self::postings::Postings;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;

View File

@@ -11,7 +11,7 @@ use crate::docset::DocSet;
/// but other implementations mocking `SegmentPostings` exist,
/// for merging segments or for testing.
pub trait Postings: DocSet + 'static {
/// Returns the term frequency
/// The number of times the term appears in the document.
fn term_freq(&self) -> u32;
/// Returns the positions offseted with a given value.

View File

@@ -133,7 +133,8 @@ impl MultiFieldPostingsWriter {
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
self.term_index.iter().collect();
Vec::with_capacity(self.term_index.len());
term_offsets.extend(self.term_index.iter());
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =

View File

@@ -1,10 +1,8 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::FieldSerializer;
use crate::DocId;
use crate::{
common::{read_u32_vint, write_u32_vint},
indexer::doc_id_mapping::DocIdMapping,
};
use common::{read_u32_vint, write_u32_vint};
const POSITION_END: u32 = 0;

View File

@@ -1,12 +1,12 @@
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::branchless_binary_search;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::BlockSearcher;
use crate::postings::BlockSegmentPostings;
use crate::postings::Postings;
use crate::{DocId, TERMINATED};
use common::HasLen;
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
@@ -18,7 +18,6 @@ pub struct SegmentPostings {
pub(crate) block_cursor: BlockSegmentPostings,
cur: usize,
position_reader: Option<PositionReader>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -28,7 +27,6 @@ impl SegmentPostings {
block_cursor: BlockSegmentPostings::empty(),
cur: 0,
position_reader: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -36,7 +34,7 @@ impl SegmentPostings {
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 {
let mut docset = self.clone();
let mut doc_freq = 0;
loop {
@@ -44,7 +42,7 @@ impl SegmentPostings {
if doc == TERMINATED {
return doc_freq;
}
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
@@ -154,7 +152,6 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: 0, // cursor within the block
position_reader,
block_searcher: BlockSearcher::default(),
}
}
}
@@ -183,8 +180,8 @@ impl DocSet for SegmentPostings {
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(output, target);
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
@@ -197,7 +194,7 @@ impl DocSet for SegmentPostings {
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
@@ -268,10 +265,10 @@ impl Postings for SegmentPostings {
mod tests {
use super::SegmentPostings;
use crate::common::HasLen;
use common::HasLen;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::postings::postings::Postings;
#[test]
@@ -299,9 +296,10 @@ mod tests {
fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3);
let delete_bitset = DeleteBitSet::for_test(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2);
let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
}
}

View File

@@ -1,7 +1,6 @@
use super::TermInfo;
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer;
@@ -12,6 +11,8 @@ use crate::schema::{Field, FieldEntry, FieldType};
use crate::schema::{IndexRecordOption, Schema};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score};
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use std::cmp::Ordering;
use std::io::{self, Write};
@@ -442,10 +443,8 @@ impl<W: Write> PostingsSerializer<W> {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;
} else {
self.output_write.write_all(&self.postings_write[..])?;
}
self.output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;

View File

@@ -148,6 +148,10 @@ impl TermHashMap {
unordered_term_id
}
pub fn len(&self) -> usize {
self.len
}
pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),

View File

@@ -1,4 +1,4 @@
use crate::common::{BinarySerializable, FixedSize};
use common::{BinarySerializable, FixedSize};
use std::io;
use std::iter::ExactSizeIterator;
use std::ops::Range;
@@ -67,7 +67,7 @@ impl BinarySerializable for TermInfo {
mod tests {
use super::TermInfo;
use crate::common::test::fixed_size_test;
use crate::tests::fixed_size_test;
// TODO add serialize/deserialize test for terminfo

View File

@@ -1,4 +1,3 @@
use crate::common::BitSet;
use crate::core::SegmentReader;
use crate::query::ConstScorer;
use crate::query::{BitSetDocSet, Explanation};
@@ -7,6 +6,7 @@ use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::TantivyError;
use crate::{DocId, Score};
use common::BitSet;
use std::io;
use std::sync::Arc;
use tantivy_fst::Automaton;
@@ -121,10 +121,7 @@ mod tests {
}
fn is_match(&self, state: &Self::State) -> bool {
match *state {
State::AfterA => true,
_ => false,
}
matches!(*state, State::AfterA)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {

View File

@@ -1,6 +1,6 @@
use crate::common::{BitSet, TinySet};
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use common::{BitSet, TinySet};
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
///
@@ -96,10 +96,13 @@ impl DocSet for BitSetDocSet {
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use super::BitSetDocSet;
use crate::common::BitSet;
use crate::docset::{DocSet, TERMINATED};
use crate::tests::generate_nonunique_unsorted;
use crate::DocId;
use common::BitSet;
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
let mut docset = BitSet::with_max_value(max_doc);
@@ -109,6 +112,29 @@ mod tests {
BitSetDocSet::from(docset)
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}
#[test]
fn test_empty() {
let bitset = BitSet::with_max_value(1000);

View File

@@ -310,7 +310,7 @@ mod tests {
));
let query = BooleanQuery::from(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32))?;
assert_nearly_equals!(explanation.value(), 0.6931472);
assert_nearly_equals!(explanation.value(), std::f32::consts::LN_2);
Ok(())
}
}

View File

@@ -1,4 +1,4 @@
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
@@ -118,8 +118,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
self.underlying.size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
self.underlying.count(delete_bitset)
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
self.underlying.count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {

View File

@@ -1,4 +1,3 @@
use crate::common::BitSet;
use crate::core::Searcher;
use crate::core::SegmentReader;
use crate::error::TantivyError;
@@ -10,6 +9,7 @@ use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score};
use common::BitSet;
use std::io;
use std::ops::{Bound, Range};

View File

@@ -10,6 +10,9 @@ use tantivy_fst::Regex;
/// containing a specific term that matches
/// a regex pattern.
///
/// Wildcard queries (e.g. ho*se) can be achieved
/// by converting them to their regex counterparts.
///
/// ```rust
/// use tantivy::collector::Count;
/// use tantivy::query::RegexQuery;
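The wildcard-to-regex conversion itself is left to the caller. A hypothetical helper (not part of tantivy) built on the regex crate:

// Escape regex metacharacters first, then turn the escaped `*` back into `.*`.
fn wildcard_to_regex(pattern: &str) -> String {
    regex::escape(pattern).replace("\\*", ".*")
}

fn main() {
    assert_eq!(wildcard_to_regex("ho*se"), "ho.*se");
}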

View File

@@ -40,8 +40,8 @@ impl Weight for TermWeight {
}
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(delete_bitset) = reader.delete_bitset() {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
if let Some(alive_bitset) = reader.alive_bitset() {
Ok(self.scorer(reader, 1.0)?.count(alive_bitset))
} else {
let field = self.term.field();
let inv_index = reader.inverted_index(field)?;

View File

@@ -1,9 +1,9 @@
use crate::common::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::DocId;
use crate::Score;
use common::TinySet;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
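HORIZON is thus 64 TinySets of 64 bits each: a scoring window of 4096 candidate docs. The core TinySet trick is popping set bits off a u64; a minimal sketch:

// Pop the lowest set bit, returning its position, until the word is empty.
fn pop_lowest(word: &mut u64) -> Option<u32> {
    if *word == 0 {
        return None;
    }
    let bit = word.trailing_zeros();
    *word &= *word - 1; // clear the lowest set bit
    Some(bit)
}

fn main() {
    let mut word: u64 = (1 << 3) | (1 << 17) | (1 << 63);
    let mut hits = vec![];
    while let Some(bit) = pop_lowest(&mut word) {
        hits.push(bit);
    }
    assert_eq!(hits, vec![3, 17, 63]);
}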

View File

@@ -1,8 +1,8 @@
#![allow(dead_code)]
use crate::common::HasLen;
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use common::HasLen;
/// Simulate a `Postings` objects from a `VecPostings`.
/// `VecPostings` only exist for testing purposes.

View File

@@ -59,8 +59,8 @@ pub trait Weight: Send + Sync + 'static {
/// Returns the number of documents within the given `SegmentReader`.
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
let mut scorer = self.scorer(reader, 1.0)?;
if let Some(delete_bitset) = reader.delete_bitset() {
Ok(scorer.count(delete_bitset))
if let Some(alive_bitset) = reader.alive_bitset() {
Ok(scorer.count(alive_bitset))
} else {
Ok(scorer.count_including_deleted())
}

View File

@@ -1,8 +1,8 @@
use super::*;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use common::BinarySerializable;
use common::VInt;
use std::io::{self, Read, Write};
use std::mem;

View File

@@ -1,4 +1,4 @@
use crate::common::BinarySerializable;
use common::BinarySerializable;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

View File

@@ -1,4 +1,4 @@
use crate::common::BinarySerializable;
use common::BinarySerializable;
use std::io;
use std::io::Read;
use std::io::Write;

View File

@@ -109,17 +109,11 @@ impl FieldEntry {
&self.field_type
}
/// Returns true iff the field is indexed
/// Returns true iff the field is indexed.
///
/// An indexed field is searchable.
pub fn is_indexed(&self) -> bool {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => options.is_indexed(),
FieldType::HierarchicalFacet(ref options) => options.is_indexed(),
FieldType::Bytes(ref options) => options.is_indexed(),
}
self.field_type.is_indexed()
}
/// Returns true iff the field is a int (signed or unsigned) fast field

View File

@@ -1,6 +1,6 @@
use crate::common::BinarySerializable;
use crate::schema::Field;
use crate::schema::Value;
use common::BinarySerializable;
use std::io::{self, Read, Write};
/// `FieldValue` holds together a `Field` and its `Value`.

View File

@@ -20,7 +20,7 @@ pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList {
#[derive(Clone)]
pub struct IndexedFlag;
/// Flag to mark the field as indexed.
/// Flag to mark the field as indexed. An indexed field is searchable.
///
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
/// Of course, text fields can also be indexed... But this is expressed by using either the

View File

@@ -29,7 +29,7 @@ impl IntOptions {
self.stored
}
/// Returns true iff the value is indexed.
/// Returns true iff the value is indexed and therefore searchable.
pub fn is_indexed(&self) -> bool {
self.indexed
}
@@ -52,6 +52,8 @@ impl IntOptions {
///
/// Setting an integer as indexed will generate
/// a posting list for each value taken by the integer.
///
/// This is required for the field to be searchable.
pub fn set_indexed(mut self) -> IntOptions {
self.indexed = true;
self

View File

@@ -157,7 +157,7 @@ pub use self::int_options::IntOptions;
/// A field name may contain any character, must have at least one character,
/// and must not start with a `-`.
pub fn is_valid_field_name(field_name: &str) -> bool {
field_name.len() > 0 && !field_name.starts_with('-')
!field_name.is_empty() && !field_name.starts_with('-')
}
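A few illustrative cases for the rule above, as a sketch against the function as written:

fn main_sketch() {
    assert!(is_valid_field_name("title"));
    assert!(is_valid_field_name("日本語")); // any character is allowed
    assert!(!is_valid_field_name("")); // must be non-empty
    assert!(!is_valid_field_name("-hidden")); // must not start with `-`
}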
#[cfg(test)]

View File

@@ -1,9 +1,9 @@
use std::fmt;
use super::Field;
use crate::common;
use crate::schema::Facet;
use crate::DateTime;
use common;
use std::str;
/// Size (in bytes) of the buffer of a int field.

View File

@@ -94,7 +94,7 @@ impl TextFieldIndexing {
}
}
/// The field will be untokenized and indexed
/// The field will be untokenized and indexed.
pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
@@ -103,7 +103,7 @@ pub const STRING: TextOptions = TextOptions {
stored: false,
};
/// The field will be tokenized and indexed
/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),

View File

@@ -276,10 +276,10 @@ impl From<PreTokenizedString> for Value {
mod binary_serialize {
use super::Value;
use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use chrono::{TimeZone, Utc};
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use std::io::{self, Read, Write};
const TEXT_CODE: u8 = 0;

View File

@@ -12,7 +12,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
unsafe {
compressed.set_len(maximum_ouput_size + 4);
}
let bytes_written = compress_into(uncompressed, compressed, 4)
let bytes_written = compress_into(uncompressed, &mut compressed[4..])
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
let num_bytes = uncompressed.len() as u32;
compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
@@ -35,7 +35,7 @@ pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<(
unsafe {
decompressed.set_len(uncompressed_size);
}
let bytes_written = decompress_into(&compressed[4..], decompressed, 0)
let bytes_written = decompress_into(&compressed[4..], decompressed)
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
if bytes_written != uncompressed_size {
return Err(io::Error::new(
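The code above frames each block by hand: a 4-byte little-endian length prefix followed by the lz4 block. For reference, a sketch of the same framing using lz4_flex's size-prepending helpers:

use lz4_flex::{compress_prepend_size, decompress_size_prepended};

fn main() {
    let uncompressed = b"the quick brown fox".to_vec();
    // compress_prepend_size writes a 4-byte LE length, then the lz4 block.
    let compressed = compress_prepend_size(&uncompressed);
    let roundtrip = decompress_size_prepended(&compressed).unwrap();
    assert_eq!(roundtrip, uncompressed);
}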

View File

@@ -1,8 +1,5 @@
use crate::{
common::{BinarySerializable, FixedSize, HasLen},
directory::FileSlice,
store::Compressor,
};
use crate::{directory::FileSlice, store::Compressor};
use common::{BinarySerializable, FixedSize, HasLen};
use std::io;
#[derive(Debug, Clone, PartialEq)]

View File

@@ -1,6 +1,6 @@
use crate::common::VInt;
use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
use crate::DocId;
use common::VInt;
use std::io;
use std::ops::Range;

View File

@@ -1,8 +1,8 @@
use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::store::index::block::CheckpointBlock;
use crate::store::index::Checkpoint;
use crate::DocId;
use common::{BinarySerializable, VInt};
pub struct LayerCursor<'a> {
remaining: &'a [u8],

View File

@@ -1,6 +1,6 @@
use crate::common::{BinarySerializable, VInt};
use crate::store::index::block::CheckpointBlock;
use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
use common::{BinarySerializable, VInt};
use std::io;
use std::io::Write;

View File

@@ -57,7 +57,7 @@ pub mod tests {
use futures::executor::block_on;
use super::*;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
use crate::schema::{Document, TextOptions};
use crate::{
@@ -113,7 +113,8 @@ pub mod tests {
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
// this will cover deletion of the first element in a checkpoint
let deleted_docids = (200..300).collect::<Vec<_>>();
let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
let alive_bitset =
AliveBitSet::for_test_from_deleted_docs(&deleted_docids, NUM_DOCS as u32);
let path = Path::new("store");
let directory = RamDirectory::create();
@@ -134,7 +135,7 @@ pub mod tests {
);
}
for (_, doc) in store.iter(Some(&delete_bitset)).enumerate() {
for (_, doc) in store.iter(Some(&alive_bitset)).enumerate() {
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().text().unwrap();
if !title_content.starts_with("Doc ") {
@@ -146,7 +147,7 @@ pub mod tests {
.unwrap()
.parse::<u32>()
.unwrap();
if delete_bitset.is_deleted(id) {
if alive_bitset.is_deleted(id) {
panic!("unexpected deleted document {}", id);
}
}
@@ -230,7 +231,7 @@ pub mod tests {
let searcher = index.reader().unwrap().searcher();
let reader = searcher.segment_reader(0);
let store = reader.get_store_reader().unwrap();
for doc in store.iter(reader.delete_bitset()) {
for doc in store.iter(reader.alive_bitset()) {
assert_eq!(
*doc?.get_first(text_field).unwrap().text().unwrap(),
"deletemenot".to_string()
@@ -288,7 +289,7 @@ pub mod tests {
let reader = searcher.segment_readers().iter().last().unwrap();
let store = reader.get_store_reader().unwrap();
for doc in store.iter(reader.delete_bitset()).take(50) {
for doc in store.iter(reader.alive_bitset()).take(50) {
assert_eq!(
*doc?.get_first(text_field).unwrap().text().unwrap(),
LOREM.to_string()

View File

@@ -5,11 +5,8 @@ use crate::schema::Document;
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
use crate::{
common::{BinarySerializable, HasLen, VInt},
error::DataCorruption,
fastfield::DeleteBitSet,
};
use crate::{error::DataCorruption, fastfield::AliveBitSet};
use common::{BinarySerializable, HasLen, VInt};
use lru::LruCache;
use std::io;
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -136,12 +133,12 @@ impl StoreReader {
/// Iterator over all Documents in their order as they are stored in the doc store.
/// Use this if you want to extract all Documents from the doc store.
/// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong.
/// The alive_bitset has to be forwarded from the `SegmentReader`, or the results may be wrong.
pub fn iter<'a: 'b, 'b>(
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
self.iter_raw(delete_bitset).map(|doc_bytes_res| {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
Ok(Document::deserialize(&mut doc_bytes)?)
})
@@ -149,10 +146,10 @@ impl StoreReader {
/// Iterator over all RawDocuments in their order as they are stored in the doc store.
/// Use this if you want to extract all Documents from the doc store.
/// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong.
/// The alive_bitset has to be forwarded from the `SegmentReader`, or the results may be wrong.
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
let last_docid = self
.block_checkpoints()
@@ -182,7 +179,7 @@ impl StoreReader {
num_skipped = 0;
}
let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
if alive {
let ret = Some((curr_block.clone(), num_skipped, reset_block_pos));
// the map block will move over the num_skipped, so we reset to 0
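The iterator above advances block by block and skips documents the alive bitset rules out. A toy sketch of the filtering idea, with a hypothetical is_alive predicate standing in for tantivy's AliveBitSet:

fn iter_alive<'a, D: 'a>(
    docs: impl Iterator<Item = D> + 'a,
    is_alive: Option<&'a dyn Fn(u32) -> bool>,
) -> impl Iterator<Item = D> + 'a {
    docs.enumerate()
        .filter(move |(doc_id, _)| is_alive.map_or(true, |f| f(*doc_id as u32)))
        .map(|(_, doc)| doc)
}

fn main() {
    let pred = |doc_id: u32| doc_id != 1;
    let alive: Vec<_> = iter_alive(vec!["a", "b", "c"].into_iter(), Some(&pred)).collect();
    assert_eq!(alive, vec!["a", "c"]);
}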

View File

@@ -1,13 +1,13 @@
use super::index::SkipIndexBuilder;
use super::StoreReader;
use super::{compressors::Compressor, footer::DocStoreFooter};
use crate::common::CountingWriter;
use crate::common::{BinarySerializable, VInt};
use crate::directory::TerminatingWrite;
use crate::directory::WritePtr;
use crate::schema::Document;
use crate::store::index::Checkpoint;
use crate::DocId;
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use std::io::{self, Write};
const BLOCK_SIZE: usize = 16_384;

View File

@@ -1,8 +1,8 @@
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use byteorder::{ByteOrder, LittleEndian};
use common::{BinarySerializable, FixedSize};
use std::cmp;
use std::io::{self, Read, Write};
use tantivy_bitpacker::compute_num_bits;
@@ -290,16 +290,16 @@ mod tests {
use super::extract_bits;
use super::TermInfoBlockMeta;
use super::{TermInfoStore, TermInfoStoreWriter};
use crate::common;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::postings::TermInfo;
use common;
use common::BinarySerializable;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
#[test]
fn test_term_info_block() {
common::test::fixed_size_test::<TermInfoBlockMeta>();
crate::tests::fixed_size_test::<TermInfoBlockMeta>();
}
#[test]

View File

@@ -1,10 +1,10 @@
use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
use super::{TermStreamer, TermStreamerBuilder};
use crate::common::{BinarySerializable, CountingWriter};
use crate::directory::{FileSlice, OwnedBytes};
use crate::error::DataCorruption;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use common::{BinarySerializable, CountingWriter};
use once_cell::sync::Lazy;
use std::io::{self, Write};
use tantivy_fst::raw::Fst;

View File

@@ -61,3 +61,31 @@ impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_alphanum_only() {
let tokens = token_stream_helper("I am a cat. 我輩は猫である。(1906)");
assert_eq!(tokens.len(), 5);
assert_token(&tokens[0], 0, "I", 0, 1);
assert_token(&tokens[1], 1, "am", 2, 4);
assert_token(&tokens[2], 2, "a", 5, 6);
assert_token(&tokens[3], 3, "cat", 7, 10);
assert_token(&tokens[4], 5, "1906", 37, 41);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

View File

@@ -56,31 +56,30 @@ impl<'a> TokenStream for LowerCaserTokenStream<'a> {
#[cfg(test)]
mod tests {
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_to_lower_case() {
assert_eq!(
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
let tokens = token_stream_helper("Tree");
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "tree", 0, 4);
let tokens = token_stream_helper("Русский текст");
assert_eq!(tokens.len(), 2);
assert_token(&tokens[0], 0, "русский", 0, 14);
assert_token(&tokens[1], 1, "текст", 15, 25);
}
fn lowercase_helper(text: &str) -> Vec<String> {
let mut tokens = vec![];
fn token_stream_helper(text: &str) -> Vec<Token> {
let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.token_stream(text);
while token_stream.advance() {
let token_text = token_stream.token().text.clone();
tokens.push(token_text);
}
let mut tokens = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
}
}

View File

@@ -131,6 +131,7 @@ mod token_stream_chain;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -143,6 +144,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
@@ -277,4 +279,25 @@ pub mod tests {
assert!(tokens.is_empty());
}
}
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
ws_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "Hello,", 0, 6);
assert_token(&tokens[1], 1, "happy", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer!", 17, 23);
}
}

View File

@@ -42,3 +42,27 @@ impl TokenStream for RawTokenStream {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{RawTokenizer, TextAnalyzer, Token};
#[test]
fn test_raw_tokenizer() {
let tokens = token_stream_helper("Hello, happy tax payer!");
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(RawTokenizer);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

Some files were not shown because too many files have changed in this diff.