Compare commits

...

90 Commits

Author SHA1 Message Date
Paul Masurel
dce6adc4b6 Revert "add index accessor for index writer (#1159)"
This reverts commit b256df6599.
2021-09-23 21:49:34 +09:00
Mestery
b256df6599 add index accessor for index writer (#1159)
* add index accessor for index writer

* Update src/indexer/index_writer.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>
2021-09-23 21:49:20 +09:00
dependabot[bot]
37c5fe3c86 Update memmap2 requirement from 0.4 to 0.5 (#1157)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.4.0...v0.5.0)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-09-23 20:18:27 +09:00
dependabot[bot]
2c78b31aab Update memmap2 requirement from 0.3 to 0.4 (#1155)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v.0.3.0...v0.4.0)
2021-09-17 08:52:52 +09:00
Paul Masurel
46b86a7976 Bounced version and edited changelog 2021-09-10 23:05:09 +09:00
PSeitz
3bc177e69d fix #1151 (#1152)
* fix #1151

Fixes a off by one error in the stats for the index fast field in the multi value fast field.
When retrieving the data range for a docid, `get(doc)..get(docid+1)` is requested. On creation
the num_vals statistic was set to doc instead of docid + 1. In the multivaluelinearinterpol fast
field the last value was therefore not serialized (and would return 0 instead in most cases).
So the last document get(lastdoc)..get(lastdoc + 1) would return the invalid range `value..0`.

This PR adds a proptest to cover this scenario. A combination of a large number values, since multilinear
interpolation is only active for more than 5_000 values, and a merge is required.
2021-09-10 23:00:37 +09:00
PSeitz
319609e9c1 test cargo-llvm-cov (#1149) 2021-09-03 22:00:43 +09:00
Kanji Yomoda
9d87b89718 Fix incorrect comment for Index::create_in_dir (#1148)
* Fix incorrect comment for Index::create_in_dir
2021-09-03 10:37:16 +09:00
Tomoko Uchida
dd81e38e53 Add WhitespaceTokenizer (#1147)
* Add WhitespaceTokenizer.
2021-08-29 18:20:49 +09:00
Paul Masurel
9f32b22602 Preparing for release. 2021-08-26 09:07:08 +09:00
sigaloid
096ce7488e Resolve some clippys, format (#1144)
* cargo +nightly clippy --fix -Z unstable-options
2021-08-26 08:46:00 +09:00
PSeitz
a1782dd172 Update index_sorting.md 2021-08-25 07:55:50 +01:00
PSeitz
000d76b11a Update index_sorting.md 2021-08-24 19:28:06 +01:00
PSeitz
abd29f6646 Update index_sorting.md 2021-08-24 19:26:19 +01:00
PSeitz
b4ecf0ab2f Merge pull request #1146 from tantivy-search/sorting_doc
add sorting to book
2021-08-23 17:37:54 +01:00
Pascal Seitz
798f7dbf67 add sorting to book 2021-08-23 17:36:41 +01:00
PSeitz
06a2e47c8d Merge pull request #1145 from tantivy-search/blub2
cargo fmt
2021-08-21 18:52:50 +01:00
Pascal Seitz
e0b83eb291 cargo fmt 2021-08-21 18:52:10 +01:00
PSeitz
13401f46ea add wildcard mention 2021-08-21 18:10:33 +01:00
PSeitz
1a45b030dc Merge pull request #1141 from tantivy-search/tantivy_common
dissolve common module
2021-08-20 08:03:37 +01:00
Pascal Seitz
62052bcc2d add missing test function
closes #1139
2021-08-20 07:26:22 +01:00
Pascal Seitz
3265f7bec3 dissolve common module 2021-08-19 23:26:34 +01:00
Pascal Seitz
ee0881712a move bitset to common crate, move composite file to directory 2021-08-19 17:45:09 +01:00
PSeitz
483e0336b6 Merge pull request #1140 from tantivy-search/tantivy_common
rename common to tantivy-common
2021-08-19 13:02:54 +01:00
Pascal Seitz
3e8f267e33 rename common to tantivy-common 2021-08-19 10:27:20 +01:00
Paul Masurel
3b247fd968 Version bump 2021-08-19 10:12:30 +09:00
Paul Masurel
750f6e6479 Removed obsolete unit test (#1138) 2021-08-19 10:07:49 +09:00
Evance Soumaoro
5b475e6603 Checksum validation using active files (#1130)
* now validate checksum uses segment files not managed files
2021-08-19 10:03:20 +09:00
PSeitz
0ca7f73dc5 add docs badge, fix build badge 2021-08-13 19:40:33 +01:00
PSeitz
47ed18845e Merge pull request #1136 from tantivy-search/minor_fixes
more docs detail
2021-08-13 18:11:47 +01:00
Pascal Seitz
dc141cdb29 more docs detail
remove code duplicate
2021-08-13 17:40:13 +01:00
PSeitz
f6cf6e889b Merge pull request #1133 from tantivy-search/merge_overflow
test doc_freq and term_freq in sorted index
2021-08-05 07:53:46 +01:00
Pascal Seitz
f379a80233 test doc_freq and term_freq in sorted index 2021-08-03 11:38:05 +01:00
PSeitz
4a320fd1ff fix delta position in merge and index sorting (#1132)
fixes #1125
2021-08-03 18:06:36 +09:00
PSeitz
85d23e8e3b Merge pull request #1129 from tantivy-search/merge_overflow
add long running test in ci
2021-08-02 15:54:31 +01:00
Pascal Seitz
022ab9d298 don't run as pr 2021-08-02 15:44:00 +01:00
Pascal Seitz
605e8603dc add positions to long running test 2021-08-02 15:29:49 +01:00
Pascal Seitz
70f160b329 add long running test in ci 2021-08-02 11:35:39 +01:00
PSeitz
6d265e6bed fix gh action name 2021-08-02 10:38:01 +01:00
PSeitz
fdc512391b Merge pull request #1128 from tantivy-search/merge_overflow
add sort to functional test, add env for iterations
2021-08-02 10:29:16 +01:00
Pascal Seitz
108714c934 add sort to functional test, add env for iterations 2021-08-02 10:11:17 +01:00
Paul Masurel
44e8cf98a5 Cargo fmt 2021-07-30 15:30:01 +09:00
Paul Masurel
f0ee69d9e9 Remove the complicated block search logic for a simpler branchless (#1124)
binary search

The code is simpler and faster.

Before
test postings::bench::bench_segment_intersection                                                                         ... bench:   2,093,697 ns/iter (+/- 115,509)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      58,585 ns/iter (+/- 796)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     160,872 ns/iter (+/- 5,164)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     615,229 ns/iter (+/- 25,108)
test postings::bench::bench_skip_next_p90                                                                                ... bench:   1,120,509 ns/iter (+/- 22,271)

After
test postings::bench::bench_segment_intersection                                                                         ... bench:   1,747,726 ns/iter (+/- 52,867)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      55,205 ns/iter (+/- 714)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     131,433 ns/iter (+/- 2,814)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     478,830 ns/iter (+/- 12,794)
test postings::bench::bench_skip_next_p90                                                                                ... bench:     931,082 ns/iter (+/- 31,468)
2021-07-30 14:38:42 +09:00
Evance Soumaoro
b8a10c8406 switched to memmap2-rs (#1120) 2021-07-27 18:40:41 +09:00
PSeitz
ff4813529e add comments on compression (#1119) 2021-07-26 22:54:22 +09:00
PSeitz
470bc18e9b Merge pull request #1118 from tantivy-search/remove_rand
move rand to optional dependencies
2021-07-21 18:01:22 +01:00
Pascal Seitz
0b1add0ec6 move rand to optional dependencies
closes #1117
2021-07-21 17:49:24 +01:00
François Massot
1db76dd9cf Merge pull request #1113 from shikhar/patch-1
stale comments in segment_reader.rs
2021-07-20 23:02:20 +02:00
François Massot
467a9517db Merge pull request #1114 from shikhar/patch-2
FilterCollector doc fix
2021-07-20 21:02:28 +02:00
Shikhar Bhushan
b361315a67 FilterCollector doc fix
Other types supported since https://github.com/tantivy-search/tantivy/pull/953/files
2021-07-15 22:55:47 -04:00
Shikhar Bhushan
4e3771bffc stale comments in segment_reader.rs 2021-07-15 22:47:32 -04:00
PSeitz
8176b0335a Merge pull request #1108 from PSeitz/pwnedbytes
move ownedbytes to own crate
2021-07-05 16:07:56 +02:00
Pascal Seitz
811ac98f36 more inlines 2021-07-05 15:49:42 +02:00
François Massot
f4b2e71800 Handle field names with any characters with a known set of special (#1109)
* Handle field names with any characters with a known set of special characters and an escape one

* Update field name validation rule to check only if it has at least one character and does not start with `-`

Closes #1087.
2021-07-05 22:31:36 +09:00
PSeitz
c431cfcf12 extend proptests, fix race condition (#1107)
* extend proptests, fix race condition
* cargo fmt
2021-07-05 18:28:56 +09:00
PSeitz
92f20bc5a2 use nightly image in coverage 2021-07-03 09:38:44 +02:00
PSeitz
57f931da3c Create coverage.yml 2021-07-03 09:35:07 +02:00
Pascal Seitz
9b662e6d03 move ownedbytes to own crate
fixes #1106
2021-07-02 16:51:59 +02:00
PSeitz
18377d949c Merge pull request #1105 from PSeitz/clippy
Fix clippy warnings
2021-07-02 10:01:19 +02:00
Pascal Seitz
e6427b2588 cleanup 2021-07-02 09:21:22 +02:00
Pascal Seitz
0062fe705d cargo fmt 2021-07-01 18:17:08 +02:00
Pascal Seitz
9b3e508753 fix clippy 2021-07-01 18:06:09 +02:00
Pascal Seitz
a1ac63ee1c fix clippy 2021-07-01 18:06:03 +02:00
Pascal Seitz
e496ae0470 clippy fixes 2021-07-01 17:43:50 +02:00
Pascal Seitz
1e4df54ab3 fix clippy 2021-07-01 17:41:53 +02:00
Pascal Seitz
2de249af74 clippy fixes 2021-07-01 17:37:37 +02:00
Pascal Seitz
10f056fbb4 apply clippy fixes 2021-07-01 17:08:44 +02:00
PSeitz
074b09d0c0 Merge pull request #1102 from PSeitz/proptests
extend proptests for sorting and merge
2021-07-01 16:23:53 +02:00
Pascal Seitz
86d0727659 add facet test
closes #1100
2021-07-01 15:36:17 +02:00
Pascal Seitz
be3e1b8718 cargo fmt 2021-07-01 14:02:09 +02:00
Pascal Seitz
8fdf59bdac add search test for proptest 2021-07-01 14:01:30 +02:00
Pascal Seitz
ebebce2102 cargo fmt 2021-07-01 10:47:20 +02:00
Pascal Seitz
8044ec38da test docstore in proptest 2021-07-01 10:15:42 +02:00
Pascal Seitz
7413f87265 use set instead of vec in proptest 2021-07-01 08:28:51 +02:00
PSeitz
aea2e77665 Merge pull request #1097 from PSeitz/multifastfield
Use dynamic fastfield codes for multivalues fixes #1093
2021-06-30 14:38:26 +02:00
Pascal Seitz
a15845f9fd add merge case to proptest, test multivalue fastfields
#1100
2021-06-30 13:13:33 +02:00
Pascal Seitz
94ac44df4f proptest with optional sorting 2021-06-30 12:06:03 +02:00
Pascal Seitz
f80d804a57 add random commits in proptest 2021-06-30 11:18:07 +02:00
Pascal Seitz
3b5c1d7817 use measure_time 0.7 2021-06-30 11:08:02 +02:00
Pascal Seitz
24274edf81 remove trait impl fpr &Vec 2021-06-30 09:50:47 +02:00
Paul Masurel
d58497529b Fixed CHANGELOG to include 0.15.2. 2021-06-30 16:34:47 +09:00
Pascal Seitz
130495abab cleanup 2021-06-30 08:57:55 +02:00
Pascal Seitz
9b743d60fb make docid mapping non optional
make docid mapping non optional
add trivial flag for docid mapping
add time measurements
2021-06-30 08:57:55 +02:00
Pascal Seitz
5c9e2ef036 wrap docidmapping in struct 2021-06-30 08:57:55 +02:00
Pascal Seitz
8526434b63 add dynamic fastfield case
add dynamic fastfield for single fast field unsorted
fix scary documentation bug
add num_len instead of len
2021-06-30 08:57:55 +02:00
Pascal Seitz
6ba302c481 Use dynamic fastfield codes for multivalues fixes #1093
Use dynamic fastfield codes for multivalues fixes (only sorting case covered)
Rename get to get_val due to conflict with Vec
use u64 precision instead of u32 for get_range, to allow use of existing fast field interface interface (actually not sure if it would be better have a different interface)
2021-06-30 08:57:55 +02:00
Paul Masurel
de92f094aa Closes #1101 fix delete documents with sort by field
Closes #1101

* fix delete documents with sort by field

Co-authored-by: Andre-Philippe Paquet <appaquet@gmail.com>
2021-06-30 15:51:32 +09:00
Evance Soumaoro
c82cee66de exposing min/max value interface on MultiValuedFastField Reader (#1096) 2021-06-23 17:38:50 +09:00
Paul Masurel
6eed05b1ce Revert "Exposing min/max value interface on MultiValuedFastField Reader (#1094)" (#1095)
This reverts commit bb488305c9.
2021-06-23 10:25:11 +09:00
Evance Soumaoro
bb488305c9 Exposing min/max value interface on MultiValuedFastField Reader (#1094)
Exposing min/max value interface on MultiValuedFastField Reader
2021-06-23 08:51:36 +09:00
139 changed files with 2659 additions and 1762 deletions

25
.github/workflows/coverage.yml vendored Normal file
View File

@@ -0,0 +1,25 @@
name: Coverage
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Rust
run: rustup toolchain install nightly --component llvm-tools-preview
- name: Install cargo-llvm-cov
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
- name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info
fail_ci_if_error: true

24
.github/workflows/long_running.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
name: Rust
on:
push:
branches: [ main ]
env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
jobs:
functional_test_unsorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
functional_test_sorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

View File

@@ -10,7 +10,7 @@ env:
CARGO_TERM_COLOR: always
jobs:
build:
test:
runs-on: ubuntu-latest

View File

@@ -1,3 +1,21 @@
Tantivy 0.16.1
========================
- Major Bugfix on multivalued fastfield. #1151
Tantivy 0.16.0
=========================
- Bugfix in the filesum check. (@evanxg852000) #1127
- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
Tantivy 0.15.3
=========================
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
Tantivy 0.15.2
========================
- Major bugfix. DocStore still panics when a deleted doc is at the beginning of a block. (@appaquet) #1088
Tantivy 0.15.1
=========================
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.15.1"
version = "0.16.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -19,7 +19,7 @@ crc32fast = "1.2.1"
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true }
@@ -31,12 +31,13 @@ num_cpus = "1.13"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2"
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8"
crossbeam = "0.8.1"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
common = { version = "0.1", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.1", path="./ownedbytes" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
@@ -53,6 +54,7 @@ rayon = "1.5"
lru = "0.6.5"
fastdivide = "0.3"
itertools = "0.10.0"
measure_time = "0.7.0"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -62,7 +64,9 @@ rand = "0.8.3"
maplit = "1.0.2"
matches = "0.1.8"
proptest = "1.0"
criterion = "0.3.4"
criterion = "0.3.5"
test-env-log = "0.2.7"
env_logger = "0.9.0"
[dev-dependencies.fail]
version = "0.4"
@@ -79,7 +83,7 @@ overflow-checks = true
[features]
default = ["mmap", "lz4-compression" ]
mmap = ["fs2", "tempfile", "memmap"]
mmap = ["fs2", "tempfile", "memmap2"]
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
@@ -90,7 +94,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -1,9 +1,9 @@
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=main)](https://travis-ci.org/tantivy-search/tantivy)
[![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/)
[![Build Status](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml)
[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/main?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)

View File

@@ -1,7 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &'static str = include_str!("alice.txt");
const ALICE_TXT: &str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.0"
version = "0.1.1"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -50,3 +50,32 @@ where
}
None
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}

View File

@@ -1,5 +1,5 @@
[package]
name = "common"
name = "tantivy-common"
version = "0.1.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
@@ -10,3 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::u64;
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);
pub struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -178,7 +178,7 @@ impl BitSet {
///
/// Reminder: the tiny set with the bucket `bucket`, represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
@@ -193,7 +193,7 @@ impl BitSet {
/// Returns the tiny bitset representing the
/// the set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
pub fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
@@ -203,11 +203,9 @@ mod tests {
use super::BitSet;
use super::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use rand::distributions::Bernoulli;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::collections::HashSet;
#[test]
@@ -263,29 +261,6 @@ mod tests {
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
@@ -340,10 +315,23 @@ mod tests {
assert_eq!(bitset.len(), 3);
}
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
StdRng::from_seed([seed_val; 32])
.sample_iter(&Bernoulli::new(ratio).unwrap())
.take(n as usize)
.enumerate()
.filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
.collect()
}
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f64);
let els = sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}

View File

@@ -1,9 +1,167 @@
use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
#[cfg(test)]
pub mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use super::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
}

View File

@@ -106,7 +106,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf);
writer.write_all(&data)
writer.write_all(data)
}
impl VInt {
@@ -181,8 +181,8 @@ mod tests {
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
let num_bytes = VInt(val).serialize_into(&mut v);
for i in num_bytes..10 {
assert_eq!(v[i], 14u8);
for el in &v[num_bytes..10] {
assert_eq!(el, &14u8);
}
assert!(num_bytes > 0);
if num_bytes < 10 {

View File

@@ -7,6 +7,7 @@
- [Segments](./basis.md)
- [Defining your schema](./schema.md)
- [Facetting](./facetting.md)
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Best practise](./inverted_index.md)

61
doc/src/index_sorting.md Normal file
View File

@@ -0,0 +1,61 @@
- [Index Sorting](#index-sorting)
+ [Why Sorting](#why-sorting)
* [Compression](#compression)
* [Top-N Optimization](#top-n-optimization)
* [Pruning](#pruning)
* [Other](#other)
+ [Usage](#usage)
# Index Sorting
Tantivy allows you to sort the index according to a property.
## Why Sorting
Presorting an index has several advantages:
###### Compression
When data is sorted it is easier to compress the data. E.g. the numbers sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5].
If we apply delta encoding this list would be unsorted [5, -3, 1, -2, 3] vs. [1, 1, 1, 1, 1].
Compression ratio is mainly affected on the fast field of the sorted property, every thing else is likely unaffected.
###### Top-N Optimization
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
Note: Tantivy 0.16 does not do this optimization yet.
###### Pruning
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
Note: Tantivy 0.16 does not do this optimization yet.
###### Other?
In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
## Usage
The index sorting can be configured setting [`sort_by_field`](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used.
```
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(settings);
let index = index_builder.create_in_ram().unwrap();
```
## Implementation details
Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).

View File

@@ -86,12 +86,10 @@ impl Collector for StatsCollector {
fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
let mut stats = Stats::default();
for segment_stats_opt in segment_stats {
if let Some(segment_stats) = segment_stats_opt {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
for segment_stats in segment_stats.into_iter().flatten() {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
Ok(stats.non_zero_count())
}
@@ -139,7 +137,7 @@ fn main() -> tantivy::Result<()> {
//
// Lets index a bunch of fake documents for the sake of
// this example.
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(

View File

@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(30_000_000)?;
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let query = BooleanQuery::new_multiterms_query(
facets
.iter()
.map(|key| Term::from_facet(ingredient, &key))
.map(|key| Term::from_facet(ingredient, key))
.collect(),
);
let top_docs_by_custom_score =

View File

@@ -22,7 +22,7 @@ fn main() -> tantivy::Result<()> {
let title = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"));

View File

@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = schema.parse_document(&short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc);

View File

@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema.clone())?;
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer = index.writer(50_000_000)?;

View File

@@ -1,4 +1,3 @@
use tantivy;
use tantivy::schema::*;
// # Document from json
@@ -22,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?;
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
@@ -31,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = schema.parse_document(&frankenstein_json)?;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory.
//

View File

@@ -9,17 +9,16 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
common = { version = "0.1", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
#prettytable-rs = {version="0.8.0" }
rand = "0.8.3"
rand = {version="0.8.3", optional= true}
[dev-dependencies]
more-asserts = "0.2.1"
rand = "0.8.3"
[features]
bin = ["prettytable-rs"]
bin = ["prettytable-rs", "rand"]
default = ["bin"]

View File

@@ -27,8 +27,7 @@ mod tests {
}
fn value_iter() -> impl Iterator<Item = u64> {
let data = (0..20_000).collect::<Vec<_>>();
data.into_iter()
0..20_000
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
b: &mut Bencher,
@@ -38,7 +37,7 @@ mod tests {
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
@@ -56,7 +55,7 @@ mod tests {
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)

View File

@@ -35,7 +35,7 @@ impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
self.min_value_u64 + self.bit_unpacker.get(doc, data)
}
#[inline]
fn min_value(&self) -> u64 {
@@ -147,7 +147,7 @@ mod tests {
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
&data, name,
data, name,
);
}
@@ -165,7 +165,7 @@ mod tests {
fn bitpacked_fast_field_rand() {
for _ in 0..500 {
let mut data = (0..1 + rand::random::<u8>() as usize)
.map(|_| rand::random::<i64>() as u64 / 2 as u64)
.map(|_| rand::random::<i64>() as u64 / 2)
.collect::<Vec<_>>();
create_and_validate(&data, "rand");

View File

@@ -51,14 +51,14 @@ pub trait FastFieldCodecSerializer {
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
}
#[derive(Debug, Clone)]
@@ -69,20 +69,14 @@ pub struct FastFieldStats {
}
impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}
impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}
@@ -100,15 +94,15 @@ mod tests {
data: &[u64],
name: &str,
) -> (f32, f32) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(&data)) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
crate::tests::stats_from_vec(&data),
crate::tests::stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
@@ -125,7 +119,7 @@ mod tests {
}
}
let actual_compression = data.len() as f32 / out.len() as f32;
return (estimation, actual_compression);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];

View File

@@ -78,7 +78,7 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
}
#[inline]
@@ -123,8 +123,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
@@ -191,8 +191,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// let's sample at 0%, 5%, 10% .. 95%, 100%
@@ -205,7 +205,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
.iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
@@ -243,7 +243,7 @@ mod tests {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(&data, name);
>(data, name);
}
#[test]
@@ -285,9 +285,7 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
for _ in 0..5000 {
let mut data = (0..50 as usize)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();

View File

@@ -30,7 +30,7 @@ fn main() {
//.unwrap();
let best_compression_ratio_codec = results
.iter()
.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
.cloned()
.unwrap();
@@ -41,6 +41,7 @@ fn main() {
} else {
(est.to_string(), comp.to_string())
};
#[allow(clippy::all)]
let style = if comp == best_compression_ratio_codec.1 {
"Fb"
} else {
@@ -96,23 +97,23 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
}
let estimation = S::estimate(&data, stats_from_vec(&data));
let estimation = S::estimate(&data, stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
return (true, estimation, actual_compression, S::NAME);
(true, estimation, actual_compression, S::NAME)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -1,3 +1,17 @@
/*!
MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the offset, but in blocks of 512.
With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 / 512 = 0,45 bits per element.
The additional space required per element in a block is the the maximum deviation of the linear interpolation estimation function.
E.g. if the maximum deviation of an element is 12, all elements cost 4bits.
Size per block:
Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata
*/
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
@@ -196,8 +210,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let mut first_function = Function {
end_pos: stats.num_vals,
@@ -309,9 +323,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
@@ -328,7 +343,7 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
@@ -367,7 +382,7 @@ mod tests {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(&data, name);
>(data, name);
}
#[test]

11
ownedbytes/Cargo.toml Normal file
View File

@@ -0,0 +1,11 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.1.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
stable_deref_trait = "1.2.0"

290
ownedbytes/src/lib.rs Normal file
View File

@@ -0,0 +1,290 @@
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
data: &'static [u8],
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
impl OwnedBytes {
/// Creates an empty `OwnedBytes`.
pub fn empty() -> OwnedBytes {
OwnedBytes::new(&[][..])
}
/// Creates an `OwnedBytes` intance given a `StableDeref` object.
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
}
}
/// creates a fileslice that is just a view over a slice of the data.
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn as_slice(&self) -> &[u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
///
/// This operation is cheap and does not require to copy any memory.
/// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped.
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline]
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
#[inline]
pub fn read_u8(&mut self) -> u8 {
assert!(!self.is_empty());
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
#[inline]
pub fn read_u64(&mut self) -> u64 {
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
}
impl fmt::Debug for OwnedBytes {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We truncate the bytes in order to make sure the debug string
// is not too long.
let bytes_truncated: &[u8] = if self.len() > 8 {
&self.as_slice()[..10]
} else {
self.as_slice()
};
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
}
}
impl Deref for OwnedBytes {
type Target = [u8];
#[inline]
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
#[inline]
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
let read_len = self.read(buf)?;
if read_len != buf.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
));
}
Ok(())
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};
use super::OwnedBytes;
#[test]
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}
#[test]
fn test_owned_bytes_read() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
{
let mut buf = [0u8; 5];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
}
{
let mut buf = [0u8; 2];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"fg");
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
}
Ok(())
}
#[test]
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 5];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf, b"abcde");
Ok(())
}
#[test]
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 7];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf[..5], b"abcde");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_to_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = Vec::new();
bytes.read_to_end(&mut buf)?;
assert_eq!(buf.as_slice(), b"abcde".as_ref());
Ok(())
}
#[test]
fn test_owned_bytes_read_u8() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
assert_eq!(bytes.read_u8(), 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_u64() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
assert_eq!(bytes.read_u64(), u64::MAX - 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_split() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
let (left, right) = bytes.split(3);
assert_eq!(left.as_slice(), b"abc");
assert_eq!(right.as_slice(), b"defghi");
}
#[test]
fn test_owned_bytes_split_boundary() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
{
let (left, right) = bytes.clone().split(0);
assert_eq!(left.as_slice(), b"");
assert_eq!(right.as_slice(), b"abcdefghi");
}
{
let (left, right) = bytes.split(9);
assert_eq!(left.as_slice(), b"abcdefghi");
assert_eq!(right.as_slice(), b"");
}
}
}

View File

@@ -14,3 +14,5 @@ edition = "2018"
[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }

View File

@@ -1,21 +1,44 @@
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::Occur;
use combine::parser::char::{char, digit, letter, space, spaces, string};
use combine::parser::char::{char, digit, space, spaces, string};
use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
};
use combine::{error::StringStreamError, parser::combinator::recognize};
use once_cell::sync::Lazy;
use regex::Regex;
fn field<'a>() -> impl Parser<&'a str, Output = String> {
(
(letter().or(char('_'))),
many(satisfy(|c: char| {
c.is_alphanumeric() || c == '_' || c == '-'
})),
)
.skip(char(':'))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
/// All characters are allowed including special characters `SPECIAL_CHARS`, but these
/// need to be escaped with a backslack character '\'.
fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
static ESCAPED_SPECIAL_CHARS_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap());
recognize::<String, _, _>(escaped(
(
take_while1(|c| !SPECIAL_CHARS.contains(&c) && c != '-'),
take_while(|c| !SPECIAL_CHARS.contains(&c)),
),
'\\',
satisfy(|c| SPECIAL_CHARS.contains(&c)),
))
.skip(char(':'))
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
.and_then(|s: String| match s.is_empty() {
true => Err(StringStreamError::UnexpectedParse),
_ => Ok(s),
})
}
fn word<'a>() -> impl Parser<&'a str, Output = String> {
@@ -98,7 +121,7 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
let term_val_with_field = negative_number().or(term_val());
(field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
field_name: Some(field_name),
phrase,
})
@@ -195,7 +218,7 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
);
(
optional(field()).skip(spaces()),
optional(field_name()).skip(spaces()),
// try elastic first, if it matches, the range is unbounded
attempt(elastic_unbounded_range).or(lower_to_upper),
)
@@ -464,21 +487,21 @@ mod test {
#[test]
fn test_parse_elastic_query_ranges() {
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("title: >a", "\"title\":{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "\"title\":[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "\"title\":{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "\"title\":{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "\"title\":{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: >70", "\"weight\":{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "\"weight\":[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "\"weight\":{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "\"weight\":{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
}
#[test]
@@ -491,22 +514,43 @@ mod test {
#[test]
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field().parse("my-field-name:a")?,
("my-field-name".to_string(), "a")
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field().parse("my_field_name:a")?,
("my_field_name".to_string(), "a")
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert!(super::field().parse(":a").is_err());
assert!(super::field().parse("-my_field:a").is_err());
assert!(super::field_name().parse("my field:a").is_err());
assert_eq!(
super::field().parse("_my_field:a")?,
super::field_name().parse("\\(1\\+1\\):2"),
Ok(("(1+1)".to_string(), "2"))
);
assert_eq!(
super::field_name().parse("my_field_name:a"),
Ok(("my_field_name".to_string(), "a"))
);
assert!(super::field_name().parse("my_field_name").is_err());
assert!(super::field_name().parse(":a").is_err());
assert!(super::field_name().parse("-my_field:a").is_err());
assert_eq!(
super::field_name().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}
#[test]
fn test_field_name_re() {
let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap();
for special_char in SPECIAL_CHARS.iter() {
assert_eq!(
escaped_special_chars_re.replace_all(&format!("\\{}", special_char), "$1"),
special_char.to_string()
);
}
}
#[test]
fn test_range_parser() {
// testing the range() parser separately
@@ -600,12 +644,14 @@ mod test {
#[test]
fn test_single_term_with_field() {
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
}
#[test]
fn test_single_term_with_float() {
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
}
#[test]
@@ -621,22 +667,27 @@ mod test {
#[test]
fn test_parse_test_query_other() {
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
test_is_parse_err("--abc:toto");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
}
#[test]
fn test_parse_query_with_range() {
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:{a TO z}", "\"foo\":{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "\"foo\":[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "\"foo\":{\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "\"foo\":[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper(
"1.2.foo.bar:[1.1 TO *}",
"\"1.2.foo.bar\":[\"1.1\" TO \"*\"}",
);
test_is_parse_err("abc + ");
}
}

View File

@@ -24,7 +24,7 @@ impl Debug for UserInputLeaf {
ref upper,
} => {
if let Some(ref field) = field {
write!(formatter, "{}:", field)?;
write!(formatter, "\"{}\":", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
@@ -45,7 +45,7 @@ pub struct UserInputLiteral {
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase),
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
}
}
@@ -79,7 +79,7 @@ impl UserInputBound {
match *self {
UserInputBound::Inclusive(ref contents) => contents,
UserInputBound::Exclusive(ref contents) => contents,
UserInputBound::Unbounded => &"*",
UserInputBound::Unbounded => "*",
}
}
}

View File

@@ -297,10 +297,8 @@ impl Collector for FacetCollector {
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
collapse_mapping.push(collapsed_id);
}
break;
}

View File

@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
/// The `FilterCollector` collector filters docs using a u64 fast field value and a predicate.
/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
///
/// ```rust

View File

@@ -276,7 +276,7 @@ mod tests {
let mut collectors = MultiCollector::new();
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
let count_handler = collectors.add_collector(Count);
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
let mut multifruits = searcher.search(&query, &collectors).unwrap();
assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);

View File

@@ -1080,7 +1080,7 @@ mod tests {
query: &str,
query_field: Field,
schema: Schema,
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
mut doc_adder: impl FnMut(&mut IndexWriter),
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();

View File

@@ -1,203 +0,0 @@
mod bitset;
mod composite_file;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use byteorder::LittleEndian as Endianness;
pub use common::CountingWriter;
pub use common::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
#[cfg(test)]
pub(crate) mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use common::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
pub use tantivy_bitpacker::minmax;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}
}

View File

@@ -42,7 +42,7 @@ fn load_metas(
"Meta file does not contain valid utf8 file.".to_string(),
)
})?;
IndexMeta::deserialize(&meta_string, &inventory)
IndexMeta::deserialize(&meta_string, inventory)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
@@ -120,7 +120,7 @@ impl IndexBuilder {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
@@ -229,7 +229,8 @@ impl Index {
/// Creates a new index using the `RamDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
/// This is useful for indexing small set of documents
/// for instances like unit test or temporary in memory index.
pub fn create_in_ram(schema: Schema) -> Index {
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
}
@@ -237,7 +238,7 @@ impl Index {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, then it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(
directory_path: P,
@@ -523,7 +524,22 @@ impl Index {
/// Returns the set of corrupted files
pub fn validate_checksum(&self) -> crate::Result<HashSet<PathBuf>> {
self.directory.list_damaged().map_err(Into::into)
let managed_files = self.directory.list_managed_files();
let active_segments_files: HashSet<PathBuf> = self
.searchable_segment_metas()?
.iter()
.flat_map(|segment_meta| segment_meta.list_files())
.collect();
let active_existing_files: HashSet<&PathBuf> =
active_segments_files.intersection(&managed_files).collect();
let mut damaged_files = HashSet::new();
for path in active_existing_files {
if !self.directory.validate_checksum(path)? {
damaged_files.insert((*path).clone());
}
}
Ok(damaged_files)
}
}
@@ -604,7 +620,7 @@ mod tests {
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::create(
directory.clone(),
directory,
Schema::builder().build(),
IndexSettings::default()
)

View File

@@ -101,6 +101,7 @@ impl SegmentMeta {
/// Returns the list of files that
/// are required for the segment meta.
/// Note: Some of the returned files may not exist depending on the state of the segment.
///
/// This is useful as the way tantivy removes files
/// is by removing all files that have been created by tantivy
@@ -369,7 +370,6 @@ mod tests {
schema::{Schema, TEXT},
IndexSettings, IndexSortByField, Order,
};
use serde_json;
#[test]
fn test_serialize_metas() {

View File

@@ -1,6 +1,5 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
@@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use common::BinarySerializable;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.

View File

@@ -16,7 +16,6 @@ pub use self::index_meta::{
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;

View File

@@ -1,13 +1,12 @@
use super::SegmentComponent;
use crate::core::Index;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{FileSlice, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
use crate::Opstamp;
use crate::{core::Index, indexer::doc_id_mapping::DocIdMapping};
use std::fmt;
use std::path::PathBuf;
@@ -90,20 +89,3 @@ impl Segment {
Ok(write)
}
}
pub trait SerializableSegment {
/// Writes a view of a segment by pushing information
/// to the `SegmentSerializer`.
///
/// # Returns
/// The number of documents in the segment.
///
/// doc_id_map is used when index is created and sorted, to map to the new doc_id order.
/// It is not used by the `IndexMerger`, since the doc_id_mapping on cross-segments works
/// differently
fn write(
&self,
serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<u32>;
}

View File

@@ -22,7 +22,7 @@ use std::sync::atomic;
pub struct SegmentId(Uuid);
#[cfg(test)]
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(atomic::AtomicUsize::default);
#[cfg(test)]
const ZERO_ARRAY: [u8; 8] = [0u8; 8];
@@ -108,6 +108,12 @@ impl fmt::Debug for SegmentId {
}
}
impl fmt::Display for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Seg({:?})", self.short_uuid_string())
}
}
impl PartialOrd for SegmentId {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))

View File

@@ -1,9 +1,10 @@
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
@@ -15,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use fail::fail_point;
use std::fmt;
use std::sync::Arc;
@@ -32,9 +32,6 @@ use std::{collections::HashMap, io};
///
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
///
/// TODO fix not decoding docfreq
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
@@ -57,17 +54,12 @@ pub struct SegmentReader {
impl SegmentReader {
/// Returns the highest document id ever attributed in
/// this segment + 1.
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.max_doc
}
/// Returns the number of documents.
/// Returns the number of alive documents.
/// Deleted documents are not counted.
///
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
@@ -81,7 +73,7 @@ impl SegmentReader {
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.len() as DocId)
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
}
@@ -329,6 +321,32 @@ mod test {
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;
#[test]
fn test_num_alive() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap"));
// ok, now we should have a deleted doc
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
assert_eq!(2, searcher.segment_reader(0).num_docs());
assert_eq!(4, searcher.segment_reader(0).max_doc());
Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -1,18 +1,17 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::FileSlice;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use common::BinarySerializable;
use common::CountingWriter;
use common::HasLen;
use common::VInt;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;
use super::HasLen;
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
@@ -188,10 +187,10 @@ impl CompositeFile {
mod test {
use super::{CompositeFile, CompositeWrite};
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::directory::{Directory, RamDirectory};
use crate::schema::Field;
use common::BinarySerializable;
use common::VInt;
use std::io::Write;
use std::path::Path;

View File

@@ -1,7 +1,7 @@
use stable_deref_trait::StableDeref;
use crate::common::HasLen;
use crate::directory::OwnedBytes;
use common::HasLen;
use std::fmt;
use std::ops::Range;
use std::sync::{Arc, Weak};
@@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] {
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
impl<B> From<B> for FileSlice
where
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
@@ -178,7 +172,7 @@ impl HasLen for FileSlice {
#[cfg(test)]
mod tests {
use super::{FileHandle, FileSlice};
use crate::common::HasLen;
use common::HasLen;
use std::io;
#[test]
@@ -211,7 +205,7 @@ mod tests {
assert_eq!(right.read_bytes()?.as_slice(), b"");
}
{
let (left, right) = file_slice.clone().split_from_end(2);
let (left, right) = file_slice.split_from_end(2);
assert_eq!(left.read_bytes()?.as_slice(), b"abcd");
assert_eq!(right.read_bytes()?.as_slice(), b"ef");
}

View File

@@ -1,10 +1,10 @@
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::{
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
directory::{AntiCallToken, TerminatingWrite},
Version, INDEX_FORMAT_VERSION,
};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
@@ -156,10 +156,8 @@ mod tests {
use crate::directory::footer::Footer;
use crate::directory::OwnedBytes;
use crate::{
common::BinarySerializable,
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
};
use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
use common::BinarySerializable;
use std::io;
#[test]

View File

@@ -1,4 +1,4 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::GarbageCollectionResult;
@@ -248,24 +248,15 @@ impl ManagedDirectory {
Ok(footer.crc() == crc)
}
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut managed_paths = self
/// List all managed files
pub fn list_managed_files(&self) -> HashSet<PathBuf> {
let managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
managed_paths.remove(*META_FILEPATH);
let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? {
damaged_files.insert(path);
}
}
Ok(damaged_files)
managed_paths
}
}
@@ -336,7 +327,6 @@ mod tests_mmap_specific {
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
use std::collections::HashSet;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -402,44 +392,7 @@ mod tests_mmap_specific {
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
assert!(managed_directory.garbage_collect(|| living_files).is_ok());
assert!(!managed_directory.exists(test_path1).unwrap());
} else {
assert!(!managed_directory.exists(test_path1).unwrap());
}
}
#[test]
fn test_checksum() -> crate::Result<()> {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let mmap_directory = MmapDirectory::open(&tempdir_path)?;
let managed_directory = ManagedDirectory::wrap(mmap_directory)?;
let mut write = managed_directory.open_write(test_path1)?;
write.write_all(&[0u8, 1u8])?;
write.terminate()?;
let mut write = managed_directory.open_write(test_path2)?;
write.write_all(&[3u8, 4u8, 5u8])?;
write.terminate()?;
let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path.clone();
corrupted_path.push(test_path2);
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;
file.flush()?;
drop(file);
let damaged = managed_directory.list_damaged()?;
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
Ok(())
assert!(!managed_directory.exists(test_path1).unwrap());
}
}

View File

@@ -11,7 +11,7 @@ use crate::directory::{AntiCallToken, FileHandle, OwnedBytes};
use crate::directory::{ArcBytes, WeakArcBytes};
use crate::directory::{TerminatingWrite, WritePtr};
use fs2::FileExt;
use memmap::Mmap;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use stable_deref_trait::StableDeref;
use std::convert::From;
@@ -53,7 +53,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
memmap2::Mmap::map(&file)
.map(Some)
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
}
@@ -485,13 +485,14 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use crate::indexer::LogMergePolicy;
use crate::Index;
use crate::ReloadPolicy;
use crate::{common::HasLen, indexer::LogMergePolicy};
use crate::{
schema::{Schema, SchemaBuilder, TEXT},
IndexSettings,
};
use common::HasLen;
#[test]
fn test_open_non_existent_path() {

View File

@@ -20,6 +20,9 @@ mod watch_event_router;
/// Errors specific to the directory module.
pub mod error;
mod composite_file;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};

View File

@@ -1,290 +1,11 @@
use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
use std::io;
use std::ops::Range;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
data: &'static [u8],
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
pub use ownedbytes::OwnedBytes;
impl FileHandle for OwnedBytes {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
Ok(self.slice(range))
}
}
impl OwnedBytes {
/// Creates an empty `OwnedBytes`.
pub fn empty() -> OwnedBytes {
OwnedBytes::new(&[][..])
}
/// Creates an `OwnedBytes` intance given a `StableDeref` object.
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
}
}
/// creates a fileslice that is just a view over a slice of the data.
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn as_slice(&self) -> &[u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
///
/// This operation is cheap and does not require to copy any memory.
/// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped.
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline]
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
pub fn read_u8(&mut self) -> u8 {
assert!(!self.is_empty());
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
pub fn read_u64(&mut self) -> u64 {
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
}
impl fmt::Debug for OwnedBytes {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We truncate the bytes in order to make sure the debug string
// is not too long.
let bytes_truncated: &[u8] = if self.len() > 8 {
&self.as_slice()[..10]
} else {
self.as_slice()
};
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
}
}
impl Deref for OwnedBytes {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
let read_len = self.read(buf)?;
if read_len != buf.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
));
}
Ok(())
}
}
impl AsRef<[u8]> for OwnedBytes {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};
use super::OwnedBytes;
#[test]
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}
#[test]
fn test_owned_bytes_read() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
{
let mut buf = [0u8; 5];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
}
{
let mut buf = [0u8; 2];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"fg");
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
}
Ok(())
}
#[test]
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 5];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf, b"abcde");
Ok(())
}
#[test]
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 7];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf[..5], b"abcde");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_to_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = Vec::new();
bytes.read_to_end(&mut buf)?;
assert_eq!(buf.as_slice(), b"abcde".as_ref());
Ok(())
}
#[test]
fn test_owned_bytes_read_u8() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
assert_eq!(bytes.read_u8(), 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_u64() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
assert_eq!(bytes.read_u64(), u64::MAX - 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_split() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
let (left, right) = bytes.split(3);
assert_eq!(left.as_slice(), b"abc");
assert_eq!(right.as_slice(), b"defghi");
}
#[test]
fn test_owned_bytes_split_boundary() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
{
let (left, right) = bytes.clone().split(0);
assert_eq!(left.as_slice(), b"");
assert_eq!(right.as_slice(), b"abcdefghi");
}
{
let (left, right) = bytes.split(9);
assert_eq!(left.as_slice(), b"abcdefghi");
assert_eq!(right.as_slice(), b"");
}
}
}

View File

@@ -1,9 +1,10 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use common::HasLen;
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;

View File

@@ -166,26 +166,26 @@ fn test_write_create_the_file(directory: &dyn Directory) {
fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
assert!(directory.open_read(test_path).is_err());
let mut write_file = directory.open_write(&test_path)?;
let mut write_file = directory.open_write(test_path)?;
write_file.write_all(&[1, 2, 3, 4])?;
write_file.flush()?;
{
let read_handle = directory.open_read(&test_path)?.read_bytes()?;
let read_handle = directory.open_read(test_path)?.read_bytes()?;
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(&test_path).is_ok());
assert!(directory.delete(test_path).is_ok());
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
if cfg!(windows) {
assert!(directory.delete(&test_path).is_ok());
assert!(directory.delete(test_path).is_ok());
}
assert!(directory.open_read(&test_path).is_err());
assert!(directory.delete(&test_path).is_err());
assert!(directory.open_read(test_path).is_err());
assert!(directory.delete(test_path).is_err());
Ok(())
}

View File

@@ -83,11 +83,11 @@ impl BytesFastFieldWriter {
&'a self,
doc_id_map: Option<&'b DocIdMapping>,
) -> impl Iterator<Item = &'b [u8]> {
let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids())
} else {
Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
as Box<dyn Iterator<Item = u32>>
let max_doc = self.doc_index.len() as u32;
Box::new(0..max_doc)
};
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
}

View File

@@ -1,9 +1,10 @@
use crate::common::{BitSet, HasLen};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::HasLen;
use std::io;
use std::io::Write;
@@ -91,6 +92,10 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
/// The number of deleted docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
@@ -106,7 +111,7 @@ impl HasLen for DeleteBitSet {
#[cfg(test)]
mod tests {
use super::DeleteBitSet;
use crate::common::HasLen;
use common::HasLen;
#[test]
fn test_delete_bitset_empty() {

View File

@@ -40,11 +40,11 @@ pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
use crate::schema::Value;
use crate::DocId;
use crate::{
chrono::{NaiveDateTime, Utc},
schema::Type,
};
use crate::{common, DocId};
mod bytes;
mod delete;
@@ -213,8 +213,7 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
@@ -222,6 +221,7 @@ mod tests {
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -267,8 +267,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 37 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
@@ -298,8 +298,8 @@ mod tests {
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
}
let file = directory.open_read(&path)?;
assert_eq!(file.len(), 62 as usize);
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -334,8 +334,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 35 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -366,8 +366,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 80043 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -405,9 +405,9 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
let file = directory.open_read(path).unwrap();
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175 as usize); // linear interpol size
assert_eq!(file.len(), 10175_usize); // linear interpol size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
@@ -447,7 +447,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
let file = directory.open_read(path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
@@ -480,7 +480,7 @@ mod tests {
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
}
let file = directory.open_read(&path)?;
let file = directory.open_read(path)?;
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -588,7 +588,7 @@ mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use std::collections::HashMap;

View File

@@ -8,14 +8,22 @@ pub use self::writer::MultiValuedFastFieldWriter;
mod tests {
use crate::collector::TopDocs;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::Cardinality;
use crate::schema::Facet;
use crate::schema::IntOptions;
use crate::schema::Schema;
use crate::schema::INDEXED;
use crate::Document;
use crate::Index;
use crate::Term;
use chrono::Duration;
use futures::executor::block_on;
use proptest::prop_oneof;
use proptest::proptest;
use proptest::strategy::Strategy;
use test_env_log::test;
#[test]
fn test_multivalued_u64() {
@@ -90,7 +98,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
@@ -121,7 +129,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
@@ -225,6 +233,111 @@ mod tests {
multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
fn test_multivalued_no_panic(ops: &[IndexingOp]) {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
match id % 3 {
0 => {
index_writer.add_document(doc!());
}
1 => {
let mut doc = Document::new();
for _ in 0..5001 {
doc.add_u64(field, id as u64);
}
index_writer.add_document(doc);
}
_ => {
let mut doc = Document::new();
doc.add_u64(field, id as u64);
index_writer.add_document(doc);
}
};
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(field, id as u64));
}
IndexingOp::Commit => {
index_writer.commit().unwrap();
}
IndexingOp::Merge => {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}
assert!(index_writer.commit().is_ok());
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if !segment_ids.is_empty() {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}
}
#[derive(Debug, Clone, Copy)]
enum IndexingOp {
AddDoc { id: u32 },
DeleteDoc { id: u32 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }),
(0u32..2u32).prop_map(|_| IndexingOp::Commit),
(0u32..1u32).prop_map(|_| IndexingOp::Merge),
]
}
proptest! {
#[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
test_multivalued_no_panic(&ops[..]);
}
}
#[test]
fn test_multivalued_proptest_off_by_one_bug_1151() {
use IndexingOp::*;
let ops = [
AddDoc { id: 3 },
AddDoc { id: 1 },
AddDoc { id: 3 },
Commit,
Merge,
];
test_multivalued_no_panic(&ops[..]);
}
#[test]
#[ignore]
fn test_many_facets() {

View File

@@ -1,8 +1,6 @@
use std::ops::Range;
use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -16,13 +14,13 @@ use crate::DocId;
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
@@ -32,6 +30,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
@@ -39,20 +38,41 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual mimimum value.
pub fn min_value(&self) -> Item {
self.vals_reader.min_value()
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
self.vals_reader.max_value()
}
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}
/// Returns the overall number of values in this field .
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
@@ -71,7 +91,7 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests {
use crate::core::Index;
use crate::schema::{Facet, Schema, INDEXED};
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
#[test]
fn test_multifastfield_reader() {
@@ -126,4 +146,32 @@ mod tests {
assert_eq!(&vals[..], &[4]);
}
}
#[test]
fn test_multifastfield_reader_min_max() {
let mut schema_builder = Schema::builder();
let field_options = IntOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
item_field => 2i64,
item_field => 3i64,
item_field => -2i64,
));
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
index_writer.add_document(doc!(item_field => 4i64));
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6);
}
}

View File

@@ -102,11 +102,11 @@ impl MultiValuedFastFieldWriter {
&'a self,
doc_id_map: Option<&'b DocIdMapping>,
) -> impl Iterator<Item = &'b [u64]> {
let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids())
} else {
Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
as Box<dyn Iterator<Item = u32>>
let max_doc = self.doc_index.len() as DocId;
Box::new(0..max_doc)
};
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
}

View File

@@ -1,6 +1,5 @@
use super::FastValue;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
@@ -8,6 +7,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
@@ -44,13 +44,13 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
fn get_range(&self, start: u64, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual mimimum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
@@ -120,7 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
@@ -226,8 +226,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}
/// Returns the minimum value for this fast field.

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
@@ -99,12 +99,19 @@ impl FastFieldReaders {
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
DynamicFastFieldReader::open(fast_field_slice)
self.typed_fast_field_reader_with_idx(field, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
@@ -112,9 +119,7 @@ impl FastFieldReaders {
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}

View File

@@ -1,8 +1,8 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use common::BinarySerializable;
use common::CountingWriter;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
@@ -46,11 +46,7 @@ fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
if !T::is_applicable(fastfield_accessor, stats.clone()) {
return;
}
let (ratio, name, id) = (
T::estimate(fastfield_accessor, stats.clone()),
T::NAME,
T::ID,
);
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
estimations.push((ratio, name, id));
}
@@ -71,7 +67,26 @@ impl CompositeFastFieldSerializer {
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
self.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
data_iter_1,
data_iter_2,
0,
)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
let mut estimations = vec![];
@@ -90,9 +105,7 @@ impl CompositeFastFieldSerializer {
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations
.iter()
.find(|estimation| estimation.0 == f32::NAN)
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",

View File

@@ -1,13 +1,12 @@
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use crate::DocId;
use common;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
@@ -296,7 +295,7 @@ impl IntFastFieldWriter {
if let Some(doc_id_map) = doc_id_map {
let iter = doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(*doc_id as usize));
.map(|doc_id| self.vals.get(doc_id as usize));
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
@@ -323,16 +322,17 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given document.
/// Return the value associated to the given doc.
///
/// This accessor should return as fast as possible.
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> u64 {
/// May panic if `doc` is greater than the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}

View File

@@ -1,5 +1,5 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::schema::Field;

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeWrite;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io;

View File

@@ -98,7 +98,7 @@ impl FieldNormsWriter {
let mut mapped_fieldnorm_values = vec![];
mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[*old_doc_id as usize];
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
}
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
} else {

View File

@@ -1,4 +1,8 @@
use crate::schema;
use crate::Index;
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::Order;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
@@ -35,10 +39,10 @@ fn test_functional_store() -> crate::Result<()> {
let mut doc_set: Vec<u64> = Vec::new();
let mut doc_id = 0u64;
for iteration in 0..500 {
for iteration in 0..get_num_iterations() {
dbg!(iteration);
let num_docs: usize = rng.gen_range(0..4);
if doc_set.len() >= 1 {
if !doc_set.is_empty() {
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
@@ -56,16 +60,37 @@ fn test_functional_store() -> crate::Result<()> {
Ok(())
}
fn get_num_iterations() -> usize {
std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
.map(|str| str.parse().unwrap())
.unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
});
let index = index_builder.create_from_tempdir().unwrap();
let reader = index.reader()?;
let mut rng = thread_rng();
@@ -75,7 +100,7 @@ fn test_functional_indexing() -> crate::Result<()> {
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
@@ -88,19 +113,95 @@ fn test_functional_indexing() -> crate::Result<()> {
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc);
}
}
Ok(())
}
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.";
fn get_text() -> String {
use rand::seq::SliceRandom;
let mut rng = thread_rng();
let tokens: Vec<_> = LOREM.split(' ').collect();
let random_val = rng.gen_range(0..20);
(0..random_val)
.map(|_| tokens.choose(&mut rng).unwrap())
.cloned()
.collect::<Vec<_>>()
.join(" ")
}
#[test]
#[ignore]
fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc);
}
}
Ok(())

View File

@@ -1,6 +1,6 @@
use super::operation::DeleteOperation;
use crate::Opstamp;
use std::mem;
use std::ops::DerefMut;
use std::sync::{Arc, RwLock, Weak};
@@ -105,7 +105,7 @@ impl DeleteQueue {
return None;
}
let delete_operations = mem::replace(&mut self_wlock.writer, vec![]);
let delete_operations = std::mem::take(&mut self_wlock.writer);
let new_block = Arc::new(Block {
operations: Arc::from(delete_operations.into_boxed_slice()),
@@ -286,7 +286,7 @@ mod tests {
operations_it.advance();
}
{
let mut operations_it = snapshot.clone();
let mut operations_it = snapshot;
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);

View File

@@ -2,13 +2,61 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//!
use super::SegmentWriter;
use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use crate::{
schema::{Field, Schema},
DocId, IndexSortByField, Order, TantivyError,
};
use std::cmp::Reverse;
/// Struct to provide mapping from old doc_id to new doc_id and vice versa
use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocidMapping<'a> {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool,
}
impl<'a> SegmentDocidMapping<'a> {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool,
) -> Self {
Self {
new_doc_id_to_old_and_segment,
is_trivial,
}
}
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
self.new_doc_id_to_old_and_segment.iter()
}
pub(crate) fn len(&self) -> usize {
self.new_doc_id_to_old_and_segment.len()
}
/// This flags means the segments are simply stacked in the order of their ordinal.
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
///
/// This allows for some optimization.
pub(crate) fn is_trivial(&self) -> bool {
self.is_trivial
}
}
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
type Output = (DocId, SegmentReaderWithOrdinal<'a>);
fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
type Item = (DocId, SegmentReaderWithOrdinal<'a>);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.new_doc_id_to_old_and_segment.into_iter()
}
}
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
old_doc_id_to_new: Vec<DocId>,
@@ -24,8 +72,8 @@ impl DocIdMapping {
self.new_doc_id_to_old[doc_id as usize]
}
/// iterate over old doc_ids in order of the new doc_ids
pub fn iter_old_doc_ids(&self) -> std::slice::Iter<'_, DocId> {
self.new_doc_id_to_old.iter()
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
}

View File

@@ -14,35 +14,27 @@ use crate::Opstamp;
// The doc to opstamp mapping stores precisely an array
// indexed by doc id and storing the opstamp of the document.
//
// This mapping is (for the moment) stricly increasing
// because of the way document id are allocated.
// This mapping is NOT necessarily increasing, because
// we might be sorting documents according to a fast field.
#[derive(Clone)]
pub enum DocToOpstampMapping<'a> {
WithMap(&'a [Opstamp]),
None,
}
impl<'a> From<&'a [u64]> for DocToOpstampMapping<'a> {
fn from(opstamps: &[Opstamp]) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(opstamps)
}
}
impl<'a> DocToOpstampMapping<'a> {
/// Given an opstamp return the limit doc id L
/// such that all doc id D such that
// D >= L iff opstamp(D) >= than `target_opstamp`.
//
// The edge case opstamp = some doc opstamp is in practise
// never called.
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
}
/// Assess whether a document should be considered deleted given that it contains
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
///
/// This function returns true if the `DocToOpstamp` mapping is none or if
/// the `doc_opstamp` is anterior to the delete opstamp.
pub fn is_deleted(&self, doc_id: DocId, delete_opstamp: Opstamp) -> bool {
match self {
Self::WithMap(doc_opstamps) => {
let doc_opstamp = doc_opstamps[doc_id as usize];
doc_opstamp < delete_opstamp
}
DocToOpstampMapping::None => DocId::max_value(),
Self::None => true,
}
}
}
@@ -55,40 +47,17 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 0u64));
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64));
}
#[test]
fn test_doc_to_opstamp_mapping_complex() {
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[1u64][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
}
{
let doc_to_opstamp_mapping =
DocToOpstampMapping::from(&[1u64, 12u64, 17u64, 23u64][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
for i in 2u64..13u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
}
for i in 13u64..18u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2);
}
for i in 18u64..24u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3);
}
for i in 24u64..30u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4);
}
}
fn test_doc_to_opstamp_mapping_with_map() {
let doc_to_opstamp_mapping = DocToOpstampMapping::WithMap(&[5u64, 1u64, 0u64, 4u64, 3u64]);
assert_eq!(doc_to_opstamp_mapping.is_deleted(0u32, 2u64), false);
assert_eq!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64), true);
assert_eq!(doc_to_opstamp_mapping.is_deleted(2u32, 2u64), true);
assert_eq!(doc_to_opstamp_mapping.is_deleted(3u32, 2u64), false);
assert_eq!(doc_to_opstamp_mapping.is_deleted(4u32, 2u64), false);
}
}

View File

@@ -1,7 +1,6 @@
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use crate::common::BitSet;
use crate::core::Index;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -24,6 +23,7 @@ use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Opstamp;
use common::BitSet;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
@@ -106,22 +106,18 @@ fn compute_deleted_bitset(
}
// A delete operation should only affect
// document that were inserted after it.
//
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
// document that were inserted before it.
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)?
{
let mut deleted_doc = docset.doc();
while deleted_doc != TERMINATED {
if deleted_doc < limit_doc {
delete_bitset.insert(deleted_doc);
let mut doc_matching_deleted_term = docset.doc();
while doc_matching_deleted_term != TERMINATED {
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
delete_bitset.insert(doc_matching_deleted_term);
might_have_changed = true;
}
deleted_doc = docset.advance();
doc_matching_deleted_term = docset.advance();
}
}
delete_cursor.advance();
@@ -230,14 +226,8 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc);
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
let delete_bitset_opt = apply_deletes(
&segment_with_max_doc,
&mut delete_cursor,
&doc_opstamps,
last_docstamp,
)?;
let delete_bitset_opt =
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
@@ -247,19 +237,26 @@ fn index_documents(
Ok(true)
}
/// `doc_opstamps` is required to be non-empty.
fn apply_deletes(
segment: &Segment,
mut delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp],
last_docstamp: Opstamp,
) -> crate::Result<Option<BitSet>> {
if delete_cursor.get().is_none() {
// if there are no delete operation in the queue, no need
// to even open the segment.
return Ok(None);
}
let max_doc_opstamp: Opstamp = doc_opstamps
.iter()
.cloned()
.max()
.expect("Empty DocOpstamp is forbidden");
let segment_reader = SegmentReader::open(segment)?;
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value(max_doc);
@@ -268,7 +265,7 @@ fn apply_deletes(
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
max_doc_opstamp,
)?;
Ok(if may_have_deletes {
Some(deleted_bitset)
@@ -358,7 +355,7 @@ impl IndexWriter {
// dropping the last reference to the segment_updater.
self.drop_sender();
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
let former_workers_handles = std::mem::take(&mut self.workers_join_handle);
for join_handle in former_workers_handles {
join_handle
.join()
@@ -628,7 +625,7 @@ impl IndexWriter {
// and recreate a new one.
self.recreate_document_channel();
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
let former_workers_join_handle = std::mem::take(&mut self.workers_join_handle);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
@@ -784,17 +781,44 @@ impl Drop for IndexWriter {
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::collections::HashSet;
use futures::executor::block_on;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use super::super::operation::UserOperation;
use crate::collector::TopDocs;
use crate::directory::error::LockError;
use crate::error::*;
use crate::fastfield::FastFieldReader;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::query::TermQuery;
use crate::schema::{self, IndexRecordOption, STRING};
use crate::schema::Cardinality;
use crate::schema::Facet;
use crate::schema::IntOptions;
use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions;
use crate::schema::STORED;
use crate::schema::TEXT;
use crate::schema::{self, IndexRecordOption, FAST, INDEXED, STRING};
use crate::DocAddress;
use crate::Index;
use crate::ReloadPolicy;
use crate::Term;
use crate::{IndexSettings, IndexSortByField, Order};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.";
#[test]
fn test_operations_group() {
@@ -1282,6 +1306,343 @@ mod tests {
assert!(commit_again.is_ok());
}
#[test]
fn test_delete_with_sort_by_field() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field =
schema_builder.add_u64_field("id", schema::INDEXED | schema::STORED | schema::FAST);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let index_reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
// create and delete docs in same commit
for id in 0u64..5u64 {
index_writer.add_document(doc!(id_field => id));
}
for id in 2u64..4u64 {
index_writer.delete_term(Term::from_field_u64(id_field, id));
}
for id in 5u64..10u64 {
index_writer.add_document(doc!(id_field => id));
}
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.num_docs(), 8);
assert_eq!(segment_reader.max_doc(), 10);
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get(doc))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
}
#[derive(Debug, Clone, Copy)]
enum IndexingOp {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new();
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
*existing_ids.entry(id).or_insert(0) += 1;
deleted_ids.remove(&id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
}
_ => {}
}
}
(existing_ids, deleted_ids)
}
fn test_operation_strategy(
ops: &[IndexingOp],
sort_index: bool,
force_end_merge: bool,
) -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
let text_field = schema_builder.add_text_field(
"text_field",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_stored(),
);
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build();
let settings = if sort_index {
IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Asc,
}),
..Default::default()
}
} else {
IndexSettings {
..Default::default()
}
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
index_writer
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
}
IndexingOp::Commit => {
index_writer.commit()?;
}
IndexingOp::Merge => {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}
let ids: HashSet<u64> = searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
})
.collect();
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
assert_eq!(
ids,
expected_ids_and_num_occurences
.keys()
.cloned()
.collect::<HashSet<_>>()
);
// multivalue fast field tests
for segment_reader in searcher.segment_readers().iter() {
let ff_reader = segment_reader.fast_fields().u64s(multi_numbers).unwrap();
for doc in segment_reader.doc_ids_alive() {
let mut vals = vec![];
ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurences.contains_key(&vals[0]));
}
}
// doc store tests
for segment_reader in searcher.segment_readers().iter() {
let store_reader = segment_reader.get_store_reader().unwrap();
// test store iterator
for doc in store_reader.iter(segment_reader.delete_bitset()) {
let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.u64_value()
.unwrap();
assert!(expected_ids_and_num_occurences.contains_key(&id));
}
// test store random access
for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader
.get(doc_id)
.unwrap()
.get_first(id_field)
.unwrap()
.u64_value()
.unwrap();
assert!(expected_ids_and_num_occurences.contains_key(&id));
let id2 = store_reader
.get(doc_id)
.unwrap()
.get_first(multi_numbers)
.unwrap()
.u64_value()
.unwrap();
assert_eq!(id, id2);
}
}
// test search
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
for (existing_id, count) in expected_ids_and_num_occurences {
assert_eq!(do_search(&existing_id.to_string()).len() as u64, count);
}
for existing_id in deleted_ids {
assert_eq!(do_search(&existing_id.to_string()).len(), 0);
}
// test facets
for segment_reader in searcher.segment_readers().iter() {
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
for doc_id in segment_reader.doc_ids_alive() {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(doc_id, &mut facet_ords);
assert_eq!(facet_ords.len(), 1);
let mut facet = Facet::default();
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get(doc_id);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected);
}
}
Ok(())
}
proptest! {
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
}
#[test]
fn test_delete_with_sort_by_field_last_opstamp_is_not_max() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let sort_by_field = schema_builder.add_u64_field("sort_by", FAST);
let id_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "sort_by".to_string(),
order: Order::Asc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
// We add a doc...
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
// And remove it.
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
// We add another doc.
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
// The expected result is a segment with
// maxdoc = 2
// numdoc = 1.
index_writer.commit()?;
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_deleted_docs(), 1);
Ok(())
}
#[test]
fn test_index_doc_missing_field() {
let mut schema_builder = schema::Schema::builder();

View File

@@ -1,4 +1,3 @@
use super::doc_id_mapping::DocIdMapping;
use crate::error::DataCorruption;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DeleteBitSet;
@@ -6,10 +5,12 @@ use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldDataAccess;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldStats;
use crate::fastfield::MultiValueLength;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::indexer::doc_id_mapping::SegmentDocidMapping;
use crate::indexer::SegmentSerializer;
use crate::postings::Postings;
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
@@ -19,22 +20,28 @@ use crate::schema::{Field, Schema};
use crate::store::StoreWriter;
use crate::termdict::TermMerger;
use crate::termdict::TermOrdinal;
use crate::{common::HasLen, fastfield::MultiValueLength};
use crate::{common::MAX_DOC_LIMIT, IndexSettings};
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field};
use crate::{core::SegmentReader, Order};
use crate::{core::SerializableSegment, IndexSortByField};
use crate::{
docset::{DocSet, TERMINATED},
SegmentOrdinal,
};
use crate::{DocId, InvertedIndexReader, SegmentComponent};
use common::HasLen;
use itertools::Itertools;
use measure_time::debug_time;
use std::cmp;
use std::collections::HashMap;
use std::sync::Arc;
use tantivy_bitpacker::minmax;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
@@ -143,7 +150,7 @@ impl TermOrdinalMapping {
.iter()
.flat_map(|term_ordinals| term_ordinals.iter().cloned().max())
.max()
.unwrap_or_else(TermOrdinal::default)
.unwrap_or_default()
}
}
@@ -215,7 +222,7 @@ impl IndexMerger {
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
@@ -233,33 +240,24 @@ impl IndexMerger {
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
for field in fields {
fieldnorms_data.clear();
if let Some(doc_id_mapping) = doc_id_mapping {
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let fieldnorms_reader =
&fieldnorms_readers[reader_with_ordinal.ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
} else {
for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id);
}
}
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let fieldnorms_reader = &fieldnorms_readers[reader_with_ordinal.ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
fieldnorms_serializer.close()?;
@@ -270,8 +268,10 @@ impl IndexMerger {
&self,
fast_field_serializer: &mut CompositeFastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
debug_time!("write_fast_fields");
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
match field_type {
@@ -319,7 +319,7 @@ impl IndexMerger {
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let (min_value, max_value) = self.readers.iter().map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader
@@ -328,7 +328,7 @@ impl IndexMerger {
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
})
.filter_map(|x| x)
.flatten()
.reduce(|a, b| {
(a.0.min(b.0), a.1.max(b.1))
}).expect("Unexpected error, empty readers in IndexMerger");
@@ -344,68 +344,44 @@ impl IndexMerger {
u64_reader
})
.collect::<Vec<_>>();
if let Some(doc_id_mapping) = doc_id_mapping {
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get(&self, doc: DocId) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter.clone(),
iter,
)?;
Ok(())
} else {
let u64_readers = self.readers.iter()
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
.map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(reader.max_doc(), u64_reader, reader.delete_bitset())
}).collect::<Vec<_>>();
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
for (max_doc, u64_reader, delete_bitset_opt) in u64_readers {
for doc_id in 0u32..max_doc {
let is_deleted = delete_bitset_opt
.map(|delete_bitset| delete_bitset.is_deleted(doc_id))
.unwrap_or(false);
if !is_deleted {
let val = u64_reader.get(doc_id);
fast_single_field_serializer.add_val(val)?;
}
}
}
fast_single_field_serializer.close_field()?;
Ok(())
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter1 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
let iter2 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter1,
iter2,
)?;
Ok(())
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
@@ -434,7 +410,7 @@ impl IndexMerger {
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
}
@@ -469,7 +445,7 @@ impl IndexMerger {
pub(crate) fn generate_doc_id_mapping(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
) -> crate::Result<SegmentDocidMapping> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
@@ -505,7 +481,7 @@ impl IndexMerger {
})
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id))
.collect::<Vec<_>>();
Ok(sorted_doc_ids)
Ok(SegmentDocidMapping::new(sorted_doc_ids, false))
}
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
@@ -517,18 +493,18 @@ impl IndexMerger {
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
reader_and_field_accessors: &[(&SegmentReader, T)],
) -> crate::Result<()> {
) -> crate::Result<Vec<u64>> {
let mut total_num_vals = 0u64;
// In the first pass, we compute the total number of vals.
//
// This is required by the bitpacker, as it needs to know
// what should be the bit length use for bitpacking.
let mut idx_num_vals = 0;
let mut num_docs = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(delete_bitset) = reader.delete_bitset() {
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64;
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = u64s_reader.get_len(doc) as u64;
@@ -536,68 +512,50 @@ impl IndexMerger {
}
}
} else {
idx_num_vals += reader.max_doc() as u64;
num_docs += reader.max_doc() as u64;
total_num_vals += u64s_reader.get_total_len();
}
}
let stats = FastFieldStats {
max_value: total_num_vals,
num_vals: idx_num_vals,
// The fastfield offset index contains (num_docs + 1) values.
num_vals: num_docs + 1,
min_value: 0,
};
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
if let Some(doc_id_mapping) = doc_id_mapping {
// copying into a temp vec is not ideal, but the fast field codec api requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// acccess on the fly or 2. change the codec api to make random access optional, but
// they both have also major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
offsets.push(offset);
offset += reader.get_len(*doc_id) as u64;
}
// copying into a temp vec is not ideal, but the fast field codec api requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// acccess on the fly or 2. change the codec api to make random access optional, but
// they both have also major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping.iter() {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
} else {
let mut offsets = vec![];
let mut offset = 0;
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
for doc in segment_reader.doc_ids_alive() {
offsets.push(offset);
offset += u64s_reader.get_len(doc) as u64;
}
}
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
offset += reader.get_len(*doc_id) as u64;
}
offsets.push(offset);
Ok(())
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets[..],
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
Ok(offsets)
}
/// Returns the fastfield index (index for the data, not the data).
fn write_multi_value_fast_field_idx(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Vec<u64>> {
let reader_and_field_accessors = self.readers.iter().map(|reader|{
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
.typed_fast_field_multi_reader(field)
@@ -618,9 +576,11 @@ impl IndexMerger {
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
debug_time!("write_hierarchical_facet_field");
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is stricly increasing.
// The second contains the actual values.
@@ -645,53 +605,61 @@ impl IndexMerger {
let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100);
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(reader_with_ordinal.ordinal as usize);
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
} else {
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(segment_ord);
let ff_reader = &fast_field_reader[segment_ord as usize];
// TODO optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(reader_with_ordinal.ordinal as usize);
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
serialize_vals.close_field()?;
}
Ok(())
}
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between index
/// sorting and the others
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocidMapping> {
let mapping: Vec<_> = self
.readers
.iter()
.enumerate()
.map(|(ordinal, reader)| {
let reader_with_ordinal = SegmentReaderWithOrdinal {
ordinal: ordinal as u32,
reader,
};
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_with_ordinal))
})
.flatten()
.collect();
Ok(SegmentDocidMapping::new(mapping, true))
}
fn write_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
// The first serves as an index into the second one and is stricly increasing.
// The second contains the actual values.
// First we merge the idx fast field.
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let offsets =
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let mut min_value = u64::max_value();
let mut max_value = u64::min_value();
let mut num_vals = 0;
let mut vals = Vec::with_capacity(100);
@@ -717,6 +685,7 @@ impl IndexMerger {
min_value = cmp::min(val, min_value);
max_value = cmp::max(val, max_value);
}
num_vals += vals.len();
}
ff_readers.push(ff_reader);
// TODO optimize when no deletes
@@ -727,40 +696,76 @@ impl IndexMerger {
max_value = 0;
}
let fast_field_reader = self
.readers
.iter()
.map(|reader| {
let ff_reader : MultiValuedFastFieldReader<u64> = reader.fast_fields()
.typed_fast_field_multi_reader(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
ff_reader
})
.collect::<Vec<_>>();
// We can now initialize our serializer, and push it the different values
let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
if let Some(doc_id_mapping) = doc_id_mapping {
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*doc_id, &mut vals);
for &val in &vals {
serialize_vals.add_val(val)?;
}
}
} else {
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
// TODO optimize if no deletes
for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
serialize_vals.add_val(val)?;
}
}
let stats = FastFieldStats {
max_value,
num_vals: num_vals as u64,
min_value,
};
struct SortedDocidMultiValueAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>,
}
impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id which will contain the position.
// the offsets are stricly increasing so we can do a simple search on it.
let new_docid = self
.offsets
.iter()
.position(|&offset| offset > pos)
.expect("pos is out of bounds")
- 1;
// now we need to find the position of `pos` in the multivalued bucket
let num_pos_covered_until_now = self.offsets[new_docid];
let pos_in_values = pos - num_pos_covered_until_now;
let (old_doc_id, reader_with_ordinal) = self.doc_id_mapping[new_docid as usize];
let num_vals = self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_len(old_doc_id);
assert!(num_vals >= pos_in_values);
let mut vals = vec![];
self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_vals(old_doc_id, &mut vals);
vals[pos_in_values as usize]
}
}
serialize_vals.close_field()?;
let fastfield_accessor = SortedDocidMultiValueAccessProvider {
doc_id_mapping,
fast_field_readers: &ff_readers,
offsets,
};
let iter1 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
})
.flatten();
let iter2 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
})
.flatten();
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
iter1,
iter2,
1,
)?;
Ok(())
}
@@ -768,7 +773,7 @@ impl IndexMerger {
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let reader_and_field_accessors = self
.readers
@@ -787,24 +792,13 @@ impl IndexMerger {
&reader_and_field_accessors,
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1);
if let Some(doc_id_mapping) = doc_id_mapping {
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let bytes_reader =
&reader_and_field_accessors[reader_with_ordinal.ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
} else {
for segment_reader in &self.readers {
let bytes_reader = segment_reader.fast_fields().bytes(field)
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
// TODO: optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
let val = bytes_reader.get_bytes(doc);
serialize_vals.write_all(val)?;
}
}
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let bytes_reader = &reader_and_field_accessors[reader_with_ordinal.ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
serialize_vals.flush()?;
Ok(())
}
@@ -815,8 +809,9 @@ impl IndexMerger {
field_type: &FieldType,
serializer: &mut InvertedIndexSerializer,
fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Option<TermOrdinalMapping>> {
debug_time!("write_postings_for_field");
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
@@ -841,40 +836,23 @@ impl IndexMerger {
};
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
if let Some(doc_id_mapping) = doc_id_mapping {
merged_doc_id_map = self
.readers
.iter()
.map(|reader| {
let mut segment_local_map = vec![];
segment_local_map.resize(reader.max_doc() as usize, None);
segment_local_map
})
.collect();
for (new_doc_id, (old_doc_id, segment_and_ordinal)) in doc_id_mapping.iter().enumerate()
{
let segment_map = &mut merged_doc_id_map[segment_and_ordinal.ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
} else {
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = self
.readers
.iter()
.map(|reader| {
let mut segment_local_map = vec![];
segment_local_map.resize(reader.max_doc() as usize, None);
segment_local_map
})
.collect();
for (new_doc_id, (old_doc_id, segment_and_ordinal)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[segment_and_ordinal.ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
// The total number of tokens will only be exact when there has been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
@@ -970,7 +948,7 @@ impl IndexMerger {
// I think this is not strictly necessary, it would be possible to
// avoid the loading into a vec via some form of kmerge, but then the merge
// logic would deviate much more from the stacking case (unsorted index)
if doc_id_mapping.is_some() {
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.push((
remapped_doc_id,
term_freq,
@@ -985,14 +963,15 @@ impl IndexMerger {
doc = segment_postings.advance();
}
}
if doc_id_mapping.is_some() {
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
field_serializer.write_doc(*doc_id, *term_freq, positions);
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// closing the term.
field_serializer.close_term()?;
}
@@ -1004,7 +983,7 @@ impl IndexMerger {
&self,
serializer: &mut InvertedIndexSerializer,
fieldnorm_readers: FieldNormReaders,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new();
for (field, field_entry) in self.schema.fields() {
@@ -1027,8 +1006,10 @@ impl IndexMerger {
fn write_storable_fields(
&self,
store_writer: &mut StoreWriter,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
debug_time!("write_storable_fields");
let store_readers: Vec<_> = self
.readers
.iter()
@@ -1039,8 +1020,8 @@ impl IndexMerger {
.enumerate()
.map(|(i, store)| store.iter_raw(self.readers[i].delete_bitset()))
.collect();
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
if !doc_id_mapping.is_trivial() {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
@@ -1084,25 +1065,24 @@ impl IndexMerger {
}
Ok(())
}
}
impl SerializableSegment for IndexMerger {
fn write(
&self,
mut serializer: SegmentSerializer,
_: Option<&DocIdMapping>,
) -> crate::Result<u32> {
/// Writes the merged segment by pushing information
/// to the `SegmentSerializer`.
///
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
None
self.get_doc_id_from_concatenated_data()?
} else {
Some(self.generate_doc_id_mapping(sort_by_field)?)
self.generate_doc_id_mapping(sort_by_field)?
}
} else {
None
self.get_doc_id_from_concatenated_data()?
};
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
@@ -2102,4 +2082,11 @@ mod tests {
Ok(())
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
}

View File

@@ -1,6 +1,7 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::fastfield::{DeleteBitSet, FastFieldReader};
use crate::schema::IndexRecordOption;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
@@ -16,7 +17,7 @@ mod tests {
schema::{self, BytesOptions},
DocAddress,
};
use crate::{IndexSettings, Term};
use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on;
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -104,9 +105,11 @@ mod tests {
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
);
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64),
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
);
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
);
assert!(index_writer.commit().is_ok());
@@ -118,7 +121,7 @@ mod tests {
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
@@ -243,6 +246,36 @@ mod tests {
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
@@ -260,6 +293,69 @@ mod tests {
}
}
#[test]
fn test_merge_unsorted_index() {
let index = create_test_index(
Some(IndexSettings {
..Default::default()
}),
false,
);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
@@ -314,7 +410,7 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
@@ -339,6 +435,34 @@ mod tests {
assert_eq!(do_search("biggest"), vec![5]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
),
2
);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();

View File

@@ -1,7 +1,7 @@
use crate::common::BitSet;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::indexer::delete_queue::DeleteCursor;
use common::BitSet;
use std::fmt;
/// A segment entry describes the state of

View File

@@ -32,6 +32,12 @@ impl SegmentRegisters {
} else if self.committed.contains_all(segment_ids) {
Some(SegmentsStatus::Committed)
} else {
warn!(
"segment_ids: {:?}, committed_ids: {:?}, uncommitted_ids {:?}",
segment_ids,
self.committed.segment_ids(),
self.uncommitted.segment_ids()
);
None
}
}
@@ -58,21 +64,6 @@ impl Debug for SegmentManager {
}
}
pub fn get_mergeable_segments(
in_merge_segment_ids: &HashSet<SegmentId>,
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(
registers_lock
.committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
)
}
impl SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
@@ -86,6 +77,20 @@ impl SegmentManager {
}
}
pub fn get_mergeable_segments(
&self,
in_merge_segment_ids: &HashSet<SegmentId>,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = self.read();
(
registers_lock
.committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
)
}
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let registers_lock = self.read();

View File

@@ -4,6 +4,7 @@ use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Display;
use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track
@@ -29,6 +30,16 @@ impl Debug for SegmentRegister {
Ok(())
}
}
impl Display for SegmentRegister {
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?;
for k in self.segment_states.keys() {
write!(f, "{}, ", k.short_uuid_string())?;
}
write!(f, ")")?;
Ok(())
}
}
impl SegmentRegister {
pub fn clear(&mut self) {
@@ -46,6 +57,10 @@ impl SegmentRegister {
.collect()
}
pub fn segment_ids(&self) -> Vec<SegmentId> {
self.segment_states.keys().cloned().collect()
}
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
self.segment_states.values().cloned().collect()
}

View File

@@ -1,11 +1,10 @@
use super::segment_manager::{get_mergeable_segments, SegmentManager};
use super::segment_manager::SegmentManager;
use crate::core::Index;
use crate::core::IndexMeta;
use crate::core::IndexSettings;
use crate::core::Segment;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::SerializableSegment;
use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::indexer::delete_queue::DeleteCursor;
@@ -140,7 +139,7 @@ fn merge(
// ... we just serialize this index merger in our new segment to merge the segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
let num_docs = merger.write(segment_serializer, None)?;
let num_docs = merger.write(segment_serializer)?;
let merged_segment_id = merged_segment.id();
@@ -209,7 +208,7 @@ pub fn merge_segments<Dir: Directory>(
&segments[..],
)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let num_docs = merger.write(segment_serializer, None)?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
@@ -528,10 +527,14 @@ impl SegmentUpdater {
}))
}
async fn consider_merge_options(&self) {
pub(crate) fn get_mergeable_segments(&self) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let merge_segment_ids: HashSet<SegmentId> = self.merge_operations.segment_in_merge();
let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&merge_segment_ids, &self.segment_manager);
self.segment_manager
.get_mergeable_segments(&merge_segment_ids)
}
async fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
@@ -717,7 +720,7 @@ mod tests {
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
assert!(!seg_ids.is_empty());
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {

View File

@@ -12,12 +12,12 @@ use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::store::StoreReader;
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{core::Segment, store::StoreWriter};
use crate::{core::SerializableSegment, store::StoreReader};
use crate::{DocId, SegmentComponent};
/// Computes the initial size of the hash table.
@@ -36,6 +36,20 @@ fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
}
}
fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,
) -> Vec<Opstamp> {
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
doc_id_mapping_opt
.iter_old_doc_ids()
.map(|doc| opstamps[doc as usize])
.collect()
} else {
opstamps
}
}
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -112,14 +126,15 @@ impl SegmentWriter {
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
write(
remap_and_write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
)?;
Ok(self.doc_opstamps)
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
Ok(doc_opstamps)
}
pub fn mem_usage(&self) -> usize {
@@ -176,7 +191,7 @@ impl SegmentWriter {
.process(&mut |token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -237,7 +252,7 @@ impl SegmentWriter {
.u64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::Date(_) => {
@@ -248,7 +263,7 @@ impl SegmentWriter {
.date_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(date_val.timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::I64(_) => {
@@ -259,7 +274,7 @@ impl SegmentWriter {
.i64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::F64(_) => {
@@ -270,7 +285,7 @@ impl SegmentWriter {
.f64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::Bytes(_) => {
@@ -281,7 +296,7 @@ impl SegmentWriter {
.bytes_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
self.multifield_postings.subscribe(doc_id, &term_buffer);
self.multifield_postings.subscribe(doc_id, term_buffer);
}
}
}
@@ -315,8 +330,12 @@ impl SegmentWriter {
}
}
// This method is used as a trick to workaround the borrow checker
fn write(
/// This method is used as a trick to workaround the borrow checker
/// Writes a view of a segment by pushing information
/// to the `SegmentSerializer`.
///
/// `doc_id_map` is used to map to the new doc_id order.
fn remap_and_write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
@@ -340,6 +359,7 @@ fn write(
&term_ord_map,
doc_id_map,
)?;
// finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {
let store_write = serializer
@@ -356,31 +376,16 @@ fn write(
.segment()
.open_read(SegmentComponent::TempStore)?,
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
}
serializer.close()?;
Ok(())
}
impl SerializableSegment for SegmentWriter {
fn write(
&self,
serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<u32> {
let max_doc = self.max_doc;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
doc_id_map,
)?;
Ok(max_doc)
}
serializer.close()?;
Ok(())
}
#[cfg(test)]

View File

@@ -1,6 +1,13 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![cfg_attr(
feature = "cargo-clippy",
allow(
clippy::module_inception,
clippy::needless_range_loop,
clippy::bool_assert_comparison
)
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -100,7 +107,6 @@
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
extern crate log;
@@ -129,7 +135,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;
@@ -157,8 +162,6 @@ pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::HasLen;
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
@@ -172,6 +175,8 @@ pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt;
use once_cell::sync::Lazy;
@@ -287,7 +292,7 @@ pub struct DocAddress {
}
#[cfg(test)]
mod tests {
pub mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
@@ -298,11 +303,18 @@ mod tests {
use crate::Index;
use crate::Postings;
use crate::ReloadPolicy;
use common::{BinarySerializable, FixedSize};
use rand::distributions::Bernoulli;
use rand::distributions::Uniform;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
/// Checks if left and right are close one to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
@@ -934,7 +946,7 @@ mod tests {
let id = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
@@ -973,7 +985,7 @@ mod tests {
let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher
.segment_readers()
.into_iter()
.iter()
.map(|reader| reader.segment_id())
.collect();
block_on(index_writer.merge(&segment_ids)).unwrap();
@@ -987,8 +999,24 @@ mod tests {
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let mut builder = Schema::builder();
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"));
writer.add_document(doc!(body => "boo"));
}
writer.commit()?;
assert!(index.validate_checksum()?.is_empty());
// delete few docs
writer.delete_term(Term::from_field_text(body, "foo"));
writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
let _ = futures::executor::block_on(writer.merge(&segment_ids));
assert!(index.validate_checksum()?.is_empty());
Ok(())
}

View File

@@ -46,7 +46,7 @@ pub mod tests {
fn create_positions_data(vals: &[u32]) -> crate::Result<OwnedBytes> {
let mut positions_buffer = vec![];
let mut serializer = PositionSerializer::new(&mut positions_buffer);
serializer.write_positions_delta(&vals);
serializer.write_positions_delta(vals);
serializer.close_term()?;
serializer.close()?;
Ok(OwnedBytes::new(positions_buffer))
@@ -169,7 +169,7 @@ pub mod tests {
let positions_delta: Vec<u32> = (0..2_000_000).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 5003499);
let mut position_reader = PositionReader::open(positions_data.clone())?;
let mut position_reader = PositionReader::open(positions_data)?;
let mut buf = [0u32; 256];
position_reader.read(128, &mut buf);
for i in 0..256 {

View File

@@ -1,9 +1,9 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{BlockDecoder, VIntDecoder};
use common::{BinarySerializable, VInt};
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.

View File

@@ -1,7 +1,7 @@
use crate::common::{BinarySerializable, CountingWriter, VInt};
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::BlockEncoder;
use crate::postings::compression::VIntEncoder;
use common::{BinarySerializable, CountingWriter, VInt};
use std::io::{self, Write};
/// The PositionSerializer is in charge of serializing all of the positions

View File

@@ -1,241 +1,109 @@
use std::ops::Range;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::AlignedBuffer;
unsafe fn binary_search_step(ptr: *const u32, target: u32, half_size: isize) -> *const u32 {
let mid = ptr.offset(half_size);
if *mid < target {
mid.offset(1)
} else {
ptr
}
}
/// This modules define the logic used to search for a doc in a given
/// block. (at most 128 docs)
/// Search the first index containing an element greater or equal to
/// the target.
///
/// Searching within a block is a hotspot when running intersection.
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
const MASK1: i32 = 78;
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actual contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = el * 2 + 1 << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}
/// This `linear search` browser exhaustively through the array.
/// but the early exit is very difficult to predict.
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// Coupled with `exponential search` this function is likely
/// to be called with the same `len`
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return begin..pivot;
}
begin = pivot;
}
begin..end
}
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let range = exponential_search(&block_docs, target);
range.start + linear_search(&block_docs[range], target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
Sse2,
Scalar,
}
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// The `start` argument is just used to hint that the response is
/// greater than beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array len is > start.
/// The block is sorted
/// The target is assumed greater or equal to the `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts by an exponential search, and
/// then operates a linear search in the result subarray.
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the block regardless of whether the block is full or not.
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
if self == BlockSearcher::Sse2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
galloping(&block_docs.0[..], target)
}
}
impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::Sse2;
}
}
BlockSearcher::Scalar
/// the `start` argument is just used to hint that the response is
/// greater than beyond `start`. the implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let start_ptr: *const u32 = &arr[0] as *const u32;
unsafe {
let mut ptr = start_ptr;
ptr = binary_search_step(ptr, target, 63);
ptr = binary_search_step(ptr, target, 31);
ptr = binary_search_step(ptr, target, 15);
ptr = binary_search_step(ptr, target, 7);
ptr = binary_search_step(ptr, target, 3);
ptr = binary_search_step(ptr, target, 1);
let extra = if *ptr < target { 1 } else { 0 };
(ptr.offset_from(start_ptr) as usize) + extra
}
}
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use super::branchless_binary_search;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
3..7
);
}
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
cursor
);
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use proptest::prelude::*;
use std::collections::HashSet;
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}
fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
fn util_test_search_in_block(block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(cursor < COMPRESSION_BLOCK_SIZE);
assert!(block[cursor] >= target);
if cursor > 0 {
assert!(block[cursor - 1] < target);
}
assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
}
fn util_test_search_in_block_all(block: &[u32]) {
let mut targets = HashSet::new();
targets.insert(0);
for &val in block {
if val > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block, target);
}
}
#[test]
fn test_search_in_block_scalar() {
test_search_in_block_util(BlockSearcher::Scalar);
fn test_search_in_branchless_binary_search() {
let v: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect();
util_test_search_in_block_all(&v[..]);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::Sse2);
fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
let mut el = 0;
for i in 0..COMPRESSION_BLOCK_SIZE {
el += deltas[i];
deltas[i] = el;
}
deltas
})
}
proptest! {
#[test]
fn test_proptest_branchless_binary_search(block in monotonous_block()) {
util_test_search_in_block_all(&block[..]);
}
}
}

View File

@@ -1,23 +1,17 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
use common::{BinarySerializable, VInt};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
if let Some(first) = it.next() {
Some(it.fold(first, Score::max))
} else {
None
}
it.next().map(|first| it.fold(first, Score::max))
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
@@ -213,9 +207,9 @@ impl BlockSegmentPostings {
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_aligned()
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
@@ -353,7 +347,6 @@ impl BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::BlockSegmentPostings;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -364,6 +357,7 @@ mod tests {
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use common::HasLen;
#[test]
fn test_empty_segment_postings() {
@@ -482,11 +476,11 @@ mod tests {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
block_postings.seek(i);
for i in &[0, 424, 10000] {
block_postings.seek(*i);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
assert!(docs[0] <= *i);
assert!(docs.last().cloned().unwrap_or(0u32) >= *i);
}
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);

View File

@@ -1,5 +1,5 @@
use crate::common::FixedSize;
use bitpacking::{BitPacker, BitPacker4x};
use common::FixedSize;
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES;
@@ -49,16 +49,10 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is align on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
#[derive(Clone)]
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
#[derive(Clone)]
pub struct BlockDecoder {
bitpacker: BitPacker4x,
output: AlignedBuffer,
output: [u32; COMPRESSION_BLOCK_SIZE],
pub output_len: usize,
}
@@ -72,7 +66,7 @@ impl BlockDecoder {
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
bitpacker: BitPacker4x::new(),
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
output: [val; COMPRESSION_BLOCK_SIZE],
output_len: 0,
}
}
@@ -85,28 +79,28 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(&compressed_data, &mut self.output.0, num_bits)
.decompress(compressed_data, &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output.0[..self.output_len]
&self.output[..self.output_len]
}
#[inline]
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output.0[idx]
self.output[idx]
}
}
@@ -190,8 +184,8 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted(
@@ -201,12 +195,12 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output);
self.output_len = num_els;
}
}
@@ -303,7 +297,7 @@ pub mod tests {
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::default();
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len(), PADDING_VALUE);
decoder.uncompress_vint_sorted(encoded_data, *offset, input.len(), PADDING_VALUE);
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
for i in input.len()..COMPRESSION_BLOCK_SIZE {

View File

@@ -3,6 +3,9 @@ Postings module (also called inverted index)
*/
mod block_search;
pub(crate) use self::block_search::branchless_binary_search;
mod block_segment_postings;
pub(crate) mod compression;
mod postings;
@@ -14,7 +17,6 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub use self::block_segment_postings::BlockSegmentPostings;
pub use self::postings::Postings;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
@@ -54,7 +56,7 @@ pub mod tests {
use crate::DocId;
use crate::HasLen;
use crate::Score;
use std::{iter, mem};
use std::mem;
#[test]
pub fn test_position_write() -> crate::Result<()> {
@@ -153,8 +155,8 @@ pub mod tests {
#[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello");
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
@@ -164,7 +166,7 @@ pub mod tests {
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
@@ -229,7 +231,7 @@ pub mod tests {
segment_writer.add_document(op, &schema).unwrap();
}
for i in 2..1000 {
let mut text: String = iter::repeat("e ").take(i).collect();
let mut text: String = "e ".repeat(i);
text.push_str(" a");
let op = AddOperation {
opstamp: 2u64,
@@ -409,7 +411,7 @@ pub mod tests {
.unwrap();
for i in 0..num_docs / 2 - 1 {
assert!(segment_postings.seek(i * 2 + 1) > (i * 1) * 2);
assert!(segment_postings.seek(i * 2 + 1) > i * 2);
assert_eq!(segment_postings.doc(), (i + 1) * 2);
}
}
@@ -431,13 +433,10 @@ pub mod tests {
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
if i % 2 == 0 {
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
assert!(segment_reader.is_deleted(i));
} else {
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
}
}

View File

@@ -11,7 +11,7 @@ use crate::docset::DocSet;
/// but other implementations mocking `SegmentPostings` exist,
/// for merging segments or for testing.
pub trait Postings: DocSet + 'static {
/// Returns the term frequency
/// The number of times the term appears in the document.
fn term_freq(&self) -> u32;
/// Returns the positions offseted with a given value.

View File

@@ -231,13 +231,7 @@ pub trait PostingsWriter {
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
self.subscribe(term_index, doc_id, token.position as u32, term_buffer, heap);
} else {
warn!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \

View File

@@ -1,10 +1,8 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::FieldSerializer;
use crate::DocId;
use crate::{
common::{read_u32_vint, write_u32_vint},
indexer::doc_id_mapping::DocIdMapping,
};
use common::{read_u32_vint, write_u32_vint};
const POSITION_END: u32 = 0;
@@ -282,7 +280,7 @@ impl Recorder for TfAndPositionRecorder {
doc_id_and_positions
.push((doc_id_map.get_new_doc_id(doc), buffer_positions.to_vec()));
} else {
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions);
serializer.write_doc(doc, buffer_positions.len() as u32, buffer_positions);
}
}
if doc_id_map.is_some() {

View File

@@ -1,12 +1,12 @@
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::positions::PositionReader;
use crate::postings::branchless_binary_search;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::BlockSearcher;
use crate::postings::BlockSegmentPostings;
use crate::postings::Postings;
use crate::{DocId, TERMINATED};
use common::HasLen;
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
@@ -18,7 +18,6 @@ pub struct SegmentPostings {
pub(crate) block_cursor: BlockSegmentPostings,
cur: usize,
position_reader: Option<PositionReader>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -28,7 +27,6 @@ impl SegmentPostings {
block_cursor: BlockSegmentPostings::empty(),
cur: 0,
position_reader: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -107,7 +105,7 @@ impl SegmentPostings {
let fieldnorm_reader = fieldnorms.map(FieldNormReader::for_test);
let average_field_norm = fieldnorms
.map(|fieldnorms| {
if fieldnorms.len() == 0 {
if fieldnorms.is_empty() {
return 0.0;
}
let total_num_tokens: u64 = fieldnorms
@@ -154,7 +152,6 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: 0, // cursor within the block
position_reader,
block_searcher: BlockSearcher::default(),
}
}
}
@@ -183,8 +180,8 @@ impl DocSet for SegmentPostings {
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(&output, target);
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
@@ -197,7 +194,7 @@ impl DocSet for SegmentPostings {
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
@@ -268,7 +265,7 @@ impl Postings for SegmentPostings {
mod tests {
use super::SegmentPostings;
use crate::common::HasLen;
use common::HasLen;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::DeleteBitSet;

View File

@@ -1,7 +1,6 @@
use super::TermInfo;
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer;
@@ -12,6 +11,8 @@ use crate::schema::{Field, FieldEntry, FieldType};
use crate::schema::{IndexRecordOption, Schema};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score};
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use std::cmp::Ordering;
use std::io::{self, Write};
@@ -356,7 +357,7 @@ impl<W: Write> PostingsSerializer<W> {
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
@@ -366,7 +367,7 @@ impl<W: Write> PostingsSerializer<W> {
if self.mode.has_freq() {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(&self.block.term_freqs());
.compress_block_unsorted(self.block.term_freqs());
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
@@ -426,7 +427,7 @@ impl<W: Write> PostingsSerializer<W> {
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
@@ -442,10 +443,8 @@ impl<W: Write> PostingsSerializer<W> {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;
} else {
self.output_write.write_all(&self.postings_write[..])?;
}
self.output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;

View File

@@ -280,11 +280,8 @@ mod tests {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
_ => {}
if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) {
v.push((i, cap));
}
}
assert_eq!(

View File

@@ -151,7 +151,7 @@ impl TermHashMap {
pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
hashmap: self,
}
}
@@ -261,8 +261,8 @@ mod tests {
}
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let iter_values = hash_map.iter();
for (key, addr, _) in iter_values {
let val: u32 = hash_map.heap.read(addr);
vanilla_hash_map.insert(key.to_owned(), val);
}

View File

@@ -1,4 +1,4 @@
use crate::common::{BinarySerializable, FixedSize};
use common::{BinarySerializable, FixedSize};
use std::io;
use std::iter::ExactSizeIterator;
use std::ops::Range;
@@ -67,7 +67,7 @@ impl BinarySerializable for TermInfo {
mod tests {
use super::TermInfo;
use crate::common::test::fixed_size_test;
use crate::tests::fixed_size_test;
// TODO add serialize/deserialize test for terminfo

View File

@@ -1,4 +1,3 @@
use crate::common::BitSet;
use crate::core::SegmentReader;
use crate::query::ConstScorer;
use crate::query::{BitSetDocSet, Explanation};
@@ -7,6 +6,7 @@ use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::TantivyError;
use crate::{DocId, Score};
use common::BitSet;
use std::io;
use std::sync::Arc;
use tantivy_fst::Automaton;
@@ -121,10 +121,7 @@ mod tests {
}
fn is_match(&self, state: &Self::State) -> bool {
match *state {
State::AfterA => true,
_ => false,
}
matches!(*state, State::AfterA)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {

View File

@@ -1,6 +1,6 @@
use crate::common::{BitSet, TinySet};
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use common::{BitSet, TinySet};
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
///
@@ -96,10 +96,13 @@ impl DocSet for BitSetDocSet {
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use super::BitSetDocSet;
use crate::common::BitSet;
use crate::docset::{DocSet, TERMINATED};
use crate::tests::generate_nonunique_unsorted;
use crate::DocId;
use common::BitSet;
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
let mut docset = BitSet::with_max_value(max_doc);
@@ -109,6 +112,29 @@ mod tests {
BitSetDocSet::from(docset)
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}
#[test]
fn test_empty() {
let bitset = BitSet::with_max_value(1000);

View File

@@ -251,7 +251,7 @@ mod tests {
impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal
self.cmp(other) == Ordering::Equal
}
}
@@ -289,7 +289,7 @@ mod tests {
if !nearly_equals(score, limit) {
checkpoints.push((doc, score));
}
return limit;
limit
});
checkpoints
}
@@ -368,10 +368,10 @@ mod tests {
.iter()
.map(|posting_list| {
posting_list
.into_iter()
.iter()
.cloned()
.flat_map(|(doc, term_freq)| {
(0 as u32..REPEAT as u32).map(move |offset| {
(0_u32..REPEAT as u32).map(move |offset| {
(
doc * (REPEAT as u32) + offset,
if offset == 0 { term_freq } else { 1 },

View File

@@ -310,7 +310,7 @@ mod tests {
));
let query = BooleanQuery::from(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32))?;
assert_nearly_equals!(explanation.value(), 0.6931472);
assert_nearly_equals!(explanation.value(), std::f32::consts::LN_2);
Ok(())
}
}

View File

@@ -329,7 +329,7 @@ impl MoreLikeThis {
continue;
}
let doc_freq = searcher.doc_freq(&term)?;
let doc_freq = searcher.doc_freq(term)?;
// ignore terms with less than min_doc_frequency
if self

Some files were not shown because too many files have changed in this diff Show More