Compare commits

..

2 Commits

Author SHA1 Message Date
Pascal Seitz
80538175e8 fix build 2024-10-16 10:33:24 +08:00
dependabot[bot]
8dc942e8e7 Update binggan requirement from 0.10.0 to 0.12.0
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-15 20:05:11 +00:00
342 changed files with 7013 additions and 21133 deletions

View File

@@ -15,11 +15,11 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Install Rust - name: Install Rust
run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2 - uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage - name: Generate code coverage
run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
uses: codecov/codecov-action@v3 uses: codecov/codecov-action@v3
continue-on-error: true continue-on-error: true

View File

@@ -39,11 +39,11 @@ jobs:
- name: Check Formatting - name: Check Formatting
run: cargo +nightly fmt --all -- --check run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation - name: Check Stable Compilation
run: cargo build --all-features run: cargo build --all-features
- name: Check Bench Compilation - name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features run: cargo +nightly bench --no-run --profile=dev --all-features
@@ -59,10 +59,10 @@ jobs:
strategy: strategy:
matrix: matrix:
features: features: [
- { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints,stemmer" } { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
- { label: "quickwit", flags: "mmap,quickwit,failpoints" } { label: "quickwit", flags: "mmap,quickwit,failpoints" }
- { label: "none", flags: "" } ]
name: test-${{ matrix.features.label}} name: test-${{ matrix.features.label}}
@@ -80,21 +80,7 @@ jobs:
- uses: Swatinem/rust-cache@v2 - uses: Swatinem/rust-cache@v2
- name: Run tests - name: Run tests
run: | run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
# (as most of them rely on mmap) otherwise run all
if [ -z "${{ matrix.features.flags }}" ]; then
cargo +stable nextest run --lib --no-default-features --verbose --workspace
else
cargo +stable nextest run --features ${{ matrix.features.flags }} --no-default-features --verbose --workspace
fi
- name: Run doctests - name: Run doctests
run: | run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
# (as most of them rely on mmap) otherwise run all
if [ -z "${{ matrix.features.flags }}" ]; then
echo "no doctest for no feature flag"
else
cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
fi

View File

@@ -46,7 +46,7 @@ The file of a segment has the format
```segment-id . ext``` ```segment-id . ext```
The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file. The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema. A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.
@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.
Tantivy's document follows a very strict schema, decided before building any index. Tantivy's document follows a very strict schema, decided before building any index.
The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy. The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
Depending on the type of the field, you can decide to Depending on the type of the field, you can decide to

View File

@@ -1,42 +1,11 @@
Tantivy 0.25 Tantivy 0.23 - Unreleased
================================ ================================
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21.
## Bugfixes
- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz)
- make zstd optional in sstable [#2633](https://github.com/quickwit-oss/tantivy/pull/2633)(@Parth)
- Fix TopDocs::order_by_string_fast_field for asc order [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
## Features/Improvements
- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/tantivy/pull/2660)(@PSeitz)
- Add string fast field support to `TopDocs`. [#2642](https://github.com/quickwit-oss/tantivy/pull/2642)(@stuhood)
- update edition to 2024 [#2620](https://github.com/quickwit-oss/tantivy/pull/2620)(@PSeitz)
- Allow optional spaces between the field name and the value in the query parser [#2678](https://github.com/quickwit-oss/tantivy/pull/2678)(@Darkheir)
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81
Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
#### Bugfixes #### Bugfixes
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz) - fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton) - fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz) - fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
- fix `OwnedBytes` debug panic [#2512](https://github.com/quickwit-oss/tantivy/pull/2512)(@b41sh)
- catch panics during merges [#2582](https://github.com/quickwit-oss/tantivy/pull/2582)(@rdettai)
- switch from u32 to usize in bitpacker. This enables multivalued columns larger than 4GB, which crashed during merge before. [#2581](https://github.com/quickwit-oss/tantivy/pull/2581) [#2586](https://github.com/quickwit-oss/tantivy/pull/2586)(@fulmicoton-dd @PSeitz)
#### Breaking API Changes #### Breaking API Changes
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz) - remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)
@@ -54,7 +23,6 @@ Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0
- reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz) - reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
- check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz) - check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
- Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz) - Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- add support for counting non integer in aggregation [#2547](https://github.com/quickwit-oss/tantivy/pull/2547)(@trinity-1686a)
- **Range Queries** - **Range Queries**
- Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz) - Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
- Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz) - Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
@@ -65,20 +33,11 @@ Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0
- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz) - add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi) - Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a) - make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
- Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW) - feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
- Make `NUM_MERGE_THREADS` configurable [#2535](https://github.com/quickwit-oss/tantivy/pull/2535)(@Barre)
- **RegexPhraseQuery** - **Optional Index in Multivalue Columnar Index** For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case). This is alleviated by placing an optional index in the multivalued index to mark documents that have values. This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
`RegexPhraseQuery` supports phrase queries with regex. E.g. query "b.* b.* wolf" matches "big bad wolf". Slop is supported as well: "b.* wolf"~2 matches "big bad wolf" [#2516](https://github.com/quickwit-oss/tantivy/pull/2516)(@PSeitz)
- **Optional Index in Multivalue Columnar Index** - **Performace/Memory**
For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case).
This is alleviated by placing an optional index in the multivalued index to mark documents that have values.
This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
- **Performance/Memory**
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz) - lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton) - Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz) - Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
@@ -92,29 +51,18 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a) - fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
- improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a) - improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
- Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a) - Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)
- allow term starting with wildcard [#2568](https://github.com/quickwit-oss/tantivy/pull/2568)(@trinity-1686a)
- Exist queries match subpath fields [#2558](https://github.com/quickwit-oss/tantivy/pull/2558)(@rdettai)
- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz) - add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz) - extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz) - add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton) - Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton) - Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
- use bingang for agg and stacker benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)[#2492](https://github.com/quickwit-oss/tantivy/pull/2492)(@PSeitz) - use bingang for agg benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)(@PSeitz)
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz) - cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz) - make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz) - remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz) - validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold) - Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.22 Tantivy 0.22
================================ ================================
@@ -769,7 +717,7 @@ Tantivy 0.4.0
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton) - Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton) - Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola) - Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to benchmark@KodrAus and @lnicola - Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- Using error-chain (@KodrAus) - Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton) - QueryParser: (@fulmicoton)
- Explicit error returned when searched for a term that is not indexed - Explicit error returned when searched for a term that is not indexed

View File

@@ -1,10 +0,0 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- alias: Quickwit Inc.
website: "https://quickwit.io"
title: "tantivy"
version: 0.22.0
doi: 10.5281/zenodo.13942948
date-released: 2024-10-17
url: "https://github.com/quickwit-oss/tantivy"

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.26.0" version = "0.23.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md" readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
edition = "2021" edition = "2021"
rust-version = "1.85" rust-version = "1.66"
exclude = ["benches/*.json", "benches/*.txt"] exclude = ["benches/*.json", "benches/*.txt"]
[dependencies] [dependencies]
@@ -31,20 +31,20 @@ lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false } zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.12.0", optional = true } tempfile = { version = "3.12.0", optional = true }
log = "0.4.16" log = "0.4.16"
serde = { version = "1.0.219", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.140" serde_json = "1.0.79"
fs4 = { version = "0.13.1", optional = true } fs4 = { version = "0.8.0", optional = true }
levenshtein_automata = "0.2.1" levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] } uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4" crossbeam-channel = "0.5.4"
rust-stemmers = { version = "1.2.0", optional = true } rust-stemmers = "1.2.0"
downcast-rs = "2.0.1" downcast-rs = "1.2.1"
bitpacking = { version = "0.9.2", default-features = false, features = [ bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x", "bitpacker4x",
] } ] }
census = "0.4.2" census = "0.4.2"
rustc-hash = "2.0.0" rustc-hash = "1.1.0"
thiserror = "2.0.1" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true } fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] } time = { version = "0.3.35", features = ["serde-well-known"] }
@@ -52,35 +52,32 @@ smallvec = "1.8.0"
rayon = "1.5.2" rayon = "1.5.2"
lru = "0.12.0" lru = "0.12.0"
fastdivide = "0.4.0" fastdivide = "0.4.0"
itertools = "0.14.0" itertools = "0.13.0"
measure_time = "0.9.0" measure_time = "0.8.2"
arc-swap = "1.5.0" arc-swap = "1.5.0"
bon = "3.3.1"
columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" } columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true } sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" } stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" } query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" } tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" } common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" } tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] } sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true } futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7" fnv = "1.0.7"
typetag = "0.2.21"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"
[dev-dependencies] [dev-dependencies]
binggan = "0.14.2" binggan = "0.12.0"
rand = "0.8.5" rand = "0.8.5"
maplit = "1.0.2" maplit = "1.0.2"
matches = "0.1.9" matches = "0.1.9"
pretty_assertions = "1.2.1" pretty_assertions = "1.2.1"
proptest = "1.7.0" proptest = "1.0.0"
test-log = "0.2.10" test-log = "0.2.10"
futures = "0.3.21" futures = "0.3.21"
paste = "1.0.11" paste = "1.0.11"
@@ -88,7 +85,7 @@ more-asserts = "0.3.1"
rand_distr = "0.4.3" rand_distr = "0.4.3"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] } time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [ postcard = { version = "1.0.4", features = [
"use-std", "use-std",
], default-features = false } ], default-features = false }
[target.'cfg(not(windows))'.dev-dependencies] [target.'cfg(not(windows))'.dev-dependencies]
@@ -113,21 +110,17 @@ debug-assertions = true
overflow-checks = true overflow-checks = true
[features] [features]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression", "stemmer"] default = ["mmap", "stopwords", "lz4-compression"]
stemmer = ["rust-stemmers"]
mmap = ["fs4", "tempfile", "memmap2"] mmap = ["fs4", "tempfile", "memmap2"]
stopwords = [] stopwords = []
lz4-compression = ["lz4_flex"] lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"] zstd-compression = ["zstd"]
# enable zstd-compression in columnar (and sstable)
columnar-zstd-compression = ["columnar/zstd-compression"]
failpoints = ["fail", "fail/failpoints"] failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches. unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util", "futures-channel"] quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data. # Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision. # Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
@@ -169,23 +162,3 @@ harness = false
[[bench]] [[bench]]
name = "agg_bench" name = "agg_bench"
harness = false harness = false
[[bench]]
name = "exists_json"
harness = false
[[bench]]
name = "range_query"
harness = false
[[bench]]
name = "and_or_queries"
harness = false
[[bench]]
name = "range_queries"
harness = false
[[bench]]
name = "bool_queries_with_range"
harness = false

View File

@@ -23,6 +23,8 @@ performance for different types of queries/collections.
Your mileage WILL vary depending on the nature of queries and their load. Your mileage WILL vary depending on the nature of queries and their load.
<img src="doc/assets/images/searchbenchmark.png">
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game). Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
## Features ## Features
@@ -123,7 +125,6 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer - [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby - [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API - [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
- [Bichon](https://github.com/rustmailer/bichon): A lightweight, high-performance Rust email archiver with WebUI
- and [more](https://github.com/search?q=tantivy)! - and [more](https://github.com/search?q=tantivy)!
### On average, how much faster is Tantivy compared to Lucene? ### On average, how much faster is Tantivy compared to Lucene?

View File

@@ -1,4 +1,4 @@
# Releasing a new Tantivy Version # Release a new Tantivy Version
## Steps ## Steps
@@ -10,29 +10,12 @@
6. Set git tag with new version 6. Set git tag with new version
[`cargo-release`](https://github.com/crate-ci/cargo-release) will help us with steps 1-5: In conjucation with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
Set new packages to version 0.0.0
Replace prev-tag-name Replace prev-tag-name
```bash ```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
``` ```
`no-tag` or it will create tags for all the subpackages no-tag or it will create tags for all the subpackages
cargo release will _not_ ignore unchanged packages, but it will print warnings for them.
e.g. "warning: updating ownedbytes to 0.10.0 despite no changes made since tag 0.24"
We need to manually ignore these unchanged packages
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag --exclude tokenizer-api
```
Add `--execute` to actually publish the packages, otherwise it will only print the commands that would be run.
### Tag Version
```bash
git tag 0.25.0
git push upstream tag 0.25.0
```

View File

@@ -10,7 +10,7 @@ rename FastFieldReaders::open to load
remove fast field reader remove fast field reader
find a way to unify the two DateTime. find a way to unify the two DateTime.
re-add type check in the filter wrapper readd type check in the filter wrapper
add unit test on columnar list columns. add unit test on columnar list columns.

View File

@@ -1,6 +1,5 @@
use binggan::plugins::PeakMemAllocPlugin; use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM}; use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::distributions::WeightedIndex;
use rand::prelude::SliceRandom; use rand::prelude::SliceRandom;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
@@ -21,6 +20,7 @@ macro_rules! register {
($runner:expr, $func:ident) => { ($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| { $runner.register(stringify!($func), move |index| {
$func(index); $func(index);
None
}) })
}; };
} }
@@ -54,41 +54,26 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, stats_f64); register!(group, stats_f64);
register!(group, extendedstats_f64); register!(group, extendedstats_f64);
register!(group, percentiles_f64); register!(group, percentiles_f64);
register!(group, terms_7); register!(group, terms_few);
register!(group, terms_all_unique); register!(group, terms_many);
register!(group, terms_150_000);
register!(group, terms_many_top_1000); register!(group, terms_many_top_1000);
register!(group, terms_many_order_by_term); register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits); register!(group, terms_many_with_top_hits);
register!(group, terms_all_unique_with_avg_sub_agg);
register!(group, terms_many_with_avg_sub_agg); register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_status_with_avg_sub_agg);
register!(group, terms_status_with_histogram);
register!(group, terms_zipf_1000);
register!(group, terms_zipf_1000_with_histogram);
register!(group, terms_zipf_1000_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_avg_sub_agg); register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
register!(group, cardinality_agg); register!(group, cardinality_agg);
register!(group, terms_status_with_cardinality_agg); register!(group, terms_few_with_cardinality_agg);
register!(group, range_agg); register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg); register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_status); register!(group, range_agg_with_term_agg_few);
register!(group, range_agg_with_term_agg_many); register!(group, range_agg_with_term_agg_many);
register!(group, histogram); register!(group, histogram);
register!(group, histogram_hard_bounds); register!(group, histogram_hard_bounds);
register!(group, histogram_with_avg_sub_agg); register!(group, histogram_with_avg_sub_agg);
register!(group, histogram_with_term_agg_status);
register!(group, avg_and_range_with_avg_sub_agg); register!(group, avg_and_range_with_avg_sub_agg);
// Filter aggregation benchmarks
register!(group, filter_agg_all_query_count_agg);
register!(group, filter_agg_term_query_count_agg);
register!(group, filter_agg_all_query_with_sub_aggs);
register!(group, filter_agg_term_query_with_sub_aggs);
group.run(); group.run();
} }
@@ -139,12 +124,12 @@ fn extendedstats_f64(index: &Index) {
} }
fn percentiles_f64(index: &Index) { fn percentiles_f64(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"mypercentiles": { "mypercentiles": {
"percentiles": { "percentiles": {
"field": "score_f64", "field": "score_f64",
"percents": [ 95, 99, 99.9 ] "percents": [ 95, 99, 99.9 ]
}
} }
}
}); });
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
@@ -159,10 +144,10 @@ fn cardinality_agg(index: &Index) {
}); });
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn terms_status_with_cardinality_agg(index: &Index) { fn terms_few_with_cardinality_agg(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"my_texts": { "my_texts": {
"terms": { "field": "text_few_terms_status" }, "terms": { "field": "text_few_terms" },
"aggs": { "aggs": {
"cardinality": { "cardinality": {
"cardinality": { "cardinality": {
@@ -175,20 +160,13 @@ fn terms_status_with_cardinality_agg(index: &Index) {
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn terms_7(index: &Index) { fn terms_few(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms_status" } }, "my_texts": { "terms": { "field": "text_few_terms" } },
}); });
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn terms_all_unique(index: &Index) { fn terms_many(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_all_unique_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_150_000(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms" } }, "my_texts": { "terms": { "field": "text_many_terms" } },
}); });
@@ -236,72 +214,6 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
}); });
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn terms_all_unique_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_all_unique_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_status_with_histogram(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms_status" },
"aggs": {
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
}
}
});
execute_agg(index, agg_req);
}
fn terms_zipf_1000_with_histogram(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_1000_terms_zipf" },
"aggs": {
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
}
}
});
execute_agg(index, agg_req);
}
fn terms_status_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms_status" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_zipf_1000_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_1000_terms_zipf" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_zipf_1000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_1000_terms_zipf" } },
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) { fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"my_texts": { "my_texts": {
@@ -357,7 +269,7 @@ fn range_agg_with_avg_sub_agg(index: &Index) {
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn range_agg_with_term_agg_status(index: &Index) { fn range_agg_with_term_agg_few(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"rangef64": { "rangef64": {
"range": { "range": {
@@ -372,7 +284,7 @@ fn range_agg_with_term_agg_status(index: &Index) {
] ]
}, },
"aggs": { "aggs": {
"my_texts": { "terms": { "field": "text_few_terms_status" } }, "my_texts": { "terms": { "field": "text_few_terms" } },
} }
}, },
}); });
@@ -428,17 +340,6 @@ fn histogram_with_avg_sub_agg(index: &Index) {
}); });
execute_agg(index, agg_req); execute_agg(index, agg_req);
} }
fn histogram_with_term_agg_status(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 10 },
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms_status" } }
}
}
});
execute_agg(index, agg_req);
}
fn avg_and_range_with_avg_sub_agg(index: &Index) { fn avg_and_range_with_avg_sub_agg(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"rangef64": { "rangef64": {
@@ -478,13 +379,6 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
} }
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> { fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
// Flag to use existing index
let reuse_index = std::env::var("REUSE_AGG_BENCH_INDEX").is_ok();
if reuse_index && std::path::Path::new("agg_bench").exists() {
return Index::open_in_dir("agg_bench");
}
// crreate dir
std::fs::create_dir_all("agg_bench")?;
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default() let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options( .set_indexing_options(
@@ -493,47 +387,20 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
.set_stored(); .set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype); let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST); let json_field = schema_builder.add_json_field("json", FAST);
let text_field_all_unique_terms =
schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST); let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms_status = let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
schema_builder.add_text_field("text_few_terms_status", STRING | FAST);
let text_field_1000_terms_zipf =
schema_builder.add_text_field("text_1000_terms_zipf", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast(); let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone()); let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone()); let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype); let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
// use tmp dir let index = Index::create_from_tempdir(schema_builder.build())?;
let index = if reuse_index { let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
Index::create_in_dir("agg_bench", schema_builder.build())?
} else {
Index::create_from_tempdir(schema_builder.build())?
};
// Approximate log proportions
let status_field_data = [
("INFO", 8000),
("ERROR", 300),
("WARN", 1200),
("DEBUG", 500),
("OK", 500),
("CRITICAL", 20),
("EMERGENCY", 1),
];
let log_level_distribution =
WeightedIndex::new(status_field_data.iter().map(|item| item.1)).unwrap();
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap(); let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000) let many_terms_data = (0..150_000)
.map(|num| format!("author{num}")) .map(|num| format!("author{num}"))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
// Prepare 1000 unique terms sampled using a Zipf distribution.
// Exponent ~1.1 approximates top-20 terms covering around ~20%.
let terms_1000: Vec<String> = (1..=1000).map(|i| format!("term_{i}")).collect();
let zipf_1000 = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
{ {
let mut rng = StdRng::from_seed([1u8; 32]); let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
@@ -543,25 +410,15 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
index_writer.add_document(doc!())?; index_writer.add_document(doc!())?;
} }
if cardinality == Cardinality::Multivalued { if cardinality == Cardinality::Multivalued {
let log_level_sample_a = status_field_data[log_level_distribution.sample(&mut rng)].0;
let log_level_sample_b = status_field_data[log_level_distribution.sample(&mut rng)].0;
let idx_a = zipf_1000.sample(&mut rng) as usize - 1;
let idx_b = zipf_1000.sample(&mut rng) as usize - 1;
let term_1000_a = &terms_1000[idx_a];
let term_1000_b = &terms_1000[idx_b];
index_writer.add_document(doc!( index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}), json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}), json_field => json!({"mixed_type": 10.0}),
text_field => "cool", text_field => "cool",
text_field => "cool", text_field => "cool",
text_field_all_unique_terms => "cool",
text_field_all_unique_terms => "coolo",
text_field_many_terms => "cool", text_field_many_terms => "cool",
text_field_many_terms => "cool", text_field_many_terms => "cool",
text_field_few_terms_status => log_level_sample_a, text_field_few_terms => "cool",
text_field_few_terms_status => log_level_sample_b, text_field_few_terms => "cool",
text_field_1000_terms_zipf => term_1000_a.as_str(),
text_field_1000_terms_zipf => term_1000_b.as_str(),
score_field => 1u64, score_field => 1u64,
score_field => 1u64, score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng), score_field_f64 => lg_norm.sample(&mut rng),
@@ -586,10 +443,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "cool", text_field => "cool",
json_field => json, json_field => json,
text_field_all_unique_terms => format!("unique_term_{}", rng.gen::<u64>()),
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(), text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0, text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),
score_field => val as u64, score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng), score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64, score_field_i64 => val as i64,
@@ -606,61 +461,3 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
Ok(index) Ok(index)
} }
// Filter aggregation benchmarks
fn filter_agg_all_query_count_agg(index: &Index) {
let agg_req = json!({
"filtered": {
"filter": "*",
"aggs": {
"count": { "value_count": { "field": "score" } }
}
}
});
execute_agg(index, agg_req);
}
fn filter_agg_term_query_count_agg(index: &Index) {
let agg_req = json!({
"filtered": {
"filter": "text:cool",
"aggs": {
"count": { "value_count": { "field": "score" } }
}
}
});
execute_agg(index, agg_req);
}
fn filter_agg_all_query_with_sub_aggs(index: &Index) {
let agg_req = json!({
"filtered": {
"filter": "*",
"aggs": {
"avg_score": { "avg": { "field": "score" } },
"stats_score": { "stats": { "field": "score_f64" } },
"terms_text": {
"terms": { "field": "text_few_terms_status" }
}
}
}
});
execute_agg(index, agg_req);
}
fn filter_agg_term_query_with_sub_aggs(index: &Index) {
let agg_req = json!({
"filtered": {
"filter": "text:cool",
"aggs": {
"avg_score": { "avg": { "field": "score" } },
"stats_score": { "stats": { "field": "score_f64" } },
"terms_text": {
"terms": { "field": "text_few_terms_status" }
}
}
}
});
execute_agg(index, agg_req);
}

View File

@@ -1,218 +0,0 @@
// Benchmarks boolean conjunction queries using binggan.
//
// Whats measured:
// - Or and And queries with varying selectivity (only `Term` queries for now on leafs)
// - Nested AND/OR combinations (on multiple fields)
// - No-scoring path using the Count collector (focus on iterator/skip performance)
// - Top-K retrieval (k=10) using the TopDocs collector
//
// Corpus model:
// - Synthetic docs; each token a/b/c is independently included per doc
// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
//
// Notes:
// - After optimization, when scoring is disabled Tantivy reads doc-only postings
// (IndexRecordOption::Basic), avoiding frequency decoding overhead.
// - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench boolean_conjunction` to run.
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::sort_key::SortByStaticFastValue;
use tantivy::collector::{Collector, Count, TopDocs};
use tantivy::query::{Query, QueryParser};
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
/// Build a single index containing both fields (title, body) and
/// return two BenchIndex views:
/// - single_field: QueryParser defaults to only "body"
/// - multi_field: QueryParser defaults to ["title", "body"]
fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
// Unified schema (two text fields)
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_body = schema_builder.add_text_field("body", TEXT);
let f_score = schema_builder.add_u64_field("score", FAST);
let f_score2 = schema_builder.add_u64_field("score2", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
// Populate: spread each present token 90/10 to body/title
{
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
for _ in 0..num_docs {
let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64);
let score = rng.gen_range(0u64..100u64);
let score2 = rng.gen_range(0u64..100_000u64);
let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new();
if has_a {
if rng.gen_bool(0.1) {
title_tokens.push("a");
} else {
body_tokens.push("a");
}
}
if has_b {
if rng.gen_bool(0.1) {
title_tokens.push("b");
} else {
body_tokens.push("b");
}
}
if has_c {
if rng.gen_bool(0.1) {
title_tokens.push("c");
} else {
body_tokens.push("c");
}
}
if title_tokens.is_empty() && body_tokens.is_empty() {
body_tokens.push("z");
}
writer
.add_document(doc!(
f_title=>title_tokens.join(" "),
f_body=>body_tokens.join(" "),
f_score=>score,
f_score2=>score2,
))
.unwrap();
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build two query parsers with different default fields.
let qp_single = QueryParser::for_index(&index, vec![f_body]);
let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
let single_view = BenchIndex {
index: index.clone(),
searcher: searcher.clone(),
query_parser: qp_single,
};
let multi_view = BenchIndex {
index,
searcher,
query_parser: qp_multi,
};
(single_view, multi_view)
}
fn main() {
// Prepare corpora with varying selectivity. Build one index per corpus
// and derive two views (single-field vs multi-field) from it.
let scenarios = vec![
(
"N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.05,
0.01,
0.15,
),
(
"N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.01,
0.01,
0.15,
),
];
let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
let mut runner = BenchRunner::new();
for (label, n, pa, pb, pc) in scenarios {
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)]
{
// Single-field group: default field is body only
let mut group = runner.new_group();
group.set_name(format!("{}{}", view_name, label));
for query_str in queries {
add_bench_task(&mut group, &bench_index, query_str, Count, "count");
add_bench_task(
&mut group,
&bench_index,
query_str,
TopDocs::with_limit(10).order_by_score(),
"top10",
);
add_bench_task(
&mut group,
&bench_index,
query_str,
TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc),
"top10_by_ff",
);
add_bench_task(
&mut group,
&bench_index,
query_str,
TopDocs::with_limit(10).order_by((
SortByStaticFastValue::<u64>::for_field("score"),
SortByStaticFastValue::<u64>::for_field("score2"),
)),
"top10_by_2ff",
);
}
group.run();
}
}
}
fn add_bench_task<C: Collector + 'static>(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query_str: &str,
collector: C,
collector_name: &str,
) {
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
let query = bench_index.query_parser.parse_query(query_str).unwrap();
let search_task = SearchTask {
searcher: bench_index.searcher.clone(),
collector,
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
struct SearchTask<C: Collector> {
searcher: Searcher,
collector: C,
query: Box<dyn Query>,
}
impl<C: Collector> SearchTask<C> {
#[inline(never)]
pub fn run(&self) -> usize {
self.searcher.search(&self.query, &self.collector).unwrap();
1
}
}

View File

@@ -1,288 +0,0 @@
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Collector, Count, DocSetCollector, TopDocs};
use tantivy::query::{Query, QueryParser};
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
fn build_shared_indices(num_docs: usize, p_title_a: f32, distribution: &str) -> BenchIndex {
// Unified schema
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_num_rand = schema_builder.add_u64_field("num_rand", INDEXED);
let f_num_asc = schema_builder.add_u64_field("num_asc", INDEXED);
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
{
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
match distribution {
"dense" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
writer
.add_document(doc!(
f_title=>title_token,
f_num_rand=>num_rand,
f_num_asc=>num_asc,
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
"sparse" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;
writer
.add_document(doc!(
f_title=>title_token,
f_num_rand=>num_rand,
f_num_asc=>num_asc,
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
_ => {
panic!("Unsupported distribution type");
}
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build query parser for title field
let qp_title = QueryParser::for_index(&index, vec![f_title]);
BenchIndex {
index,
searcher,
query_parser: qp_title,
}
}
fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
0,
9,
),
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
990,
999,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
0,
9,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
9_999_990,
9_999_999,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
// Build index for this scenario
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
// Create benchmark group
let mut group = runner.new_group();
// Now set the name (this moves scenario_id)
group.set_name(scenario_id);
// Define all four field types
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];
// Define the three terms we want to test with
let terms = ["a", "b", "z"];
// Generate all combinations of terms and field names
let mut queries = Vec::new();
for &term in &terms {
for &field_name in &field_names {
let query_str = format!(
"{} AND {}:[{} TO {}]",
term, field_name, range_low, range_high
);
queries.push((query_str, field_name.to_string()));
}
}
let query_str = format!(
"{}:[{} TO {}] AND {}:[{} TO {}]",
"num_rand_fast", range_low, range_high, "num_asc_fast", range_low, range_high
);
queries.push((query_str, "num_asc_fast".to_string()));
// Run all benchmark tasks for each query and its corresponding field name
for (query_str, field_name) in queries {
run_benchmark_tasks(&mut group, &bench_index, &query_str, &field_name);
}
group.run();
}
}
/// Run all benchmark tasks for a given query string and field name
fn run_benchmark_tasks(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query_str: &str,
field_name: &str,
) {
// Test count
add_bench_task(bench_group, bench_index, query_str, Count, "count");
// Test all results
add_bench_task(
bench_group,
bench_index,
query_str,
DocSetCollector,
"all results",
);
// Test top 100 by the field (if it's a FAST field)
if field_name.ends_with("_fast") {
// Ascending order
{
let collector_name = format!("top100_by_{}_asc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task(
bench_group,
bench_index,
query_str,
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Asc),
&collector_name,
);
}
// Descending order
{
let collector_name = format!("top100_by_{}_desc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task(
bench_group,
bench_index,
query_str,
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Desc),
&collector_name,
);
}
}
}
fn add_bench_task<C: Collector + 'static>(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query_str: &str,
collector: C,
collector_name: &str,
) {
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
let query = bench_index.query_parser.parse_query(query_str).unwrap();
let search_task = SearchTask {
searcher: bench_index.searcher.clone(),
collector,
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
struct SearchTask<C: Collector> {
searcher: Searcher,
collector: C,
query: Box<dyn Query>,
}
impl<C: Collector> SearchTask<C> {
#[inline(never)]
pub fn run(&self) -> usize {
let result = self.searcher.search(&self.query, &self.collector).unwrap();
if let Some(count) = (&result as &dyn std::any::Any).downcast_ref::<usize>() {
*count
} else if let Some(top_docs) = (&result as &dyn std::any::Any)
.downcast_ref::<Vec<(Option<u64>, tantivy::DocAddress)>>()
{
top_docs.len()
} else if let Some(top_docs) =
(&result as &dyn std::any::Any).downcast_ref::<Vec<(u64, tantivy::DocAddress)>>()
{
top_docs.len()
} else if let Some(doc_set) = (&result as &dyn std::any::Any)
.downcast_ref::<std::collections::HashSet<tantivy::DocAddress>>()
{
doc_set.len()
} else {
eprintln!(
"Unknown collector result type: {:?}",
std::any::type_name::<C::Fruit>()
);
0
}
}
}

View File

@@ -1,69 +0,0 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
let doc_count: usize = 500_000;
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
let indices: Vec<(String, Index)> = subfield_counts
.iter()
.map(|&sub_fields| {
(
format!("subfields={sub_fields}"),
build_index_with_json_subfields(doc_count, sub_fields),
)
})
.collect();
let mut group = InputGroup::new_with_inputs(indices);
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
group.config().num_iter_group = Some(1);
group.config().num_iter_bench = Some(1);
group.register("exists_json", exists_json_union);
group.run();
}
fn exists_json_union(index: &Index) {
let reader = index.reader().expect("reader");
let searcher = reader.searcher();
let query = ExistsQuery::new("json".to_string(), true);
let count = searcher.search(&query, &Count).expect("exists search");
// Prevents optimizer from eliding the search
black_box(count);
}
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
// Schema: single JSON field stored as FAST to support ExistsQuery.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).expect("create index");
{
let mut index_writer = index
.writer_with_num_threads(1, 200_000_000)
.expect("writer");
for i in 0..num_docs {
let sub = i % num_subfields;
// Only one subpath set per document; rotate subpaths so that
// no single subpath is full, but the union covers all docs.
let v = json!({ format!("field_{sub}"): i as u64 });
index_writer
.add_document(doc!(json_field => v))
.expect("add_document");
}
index_writer.commit().expect("commit");
}
index
}

View File

@@ -1,365 +0,0 @@
use std::ops::Bound;
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector, TopDocs};
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, Term};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
}
fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
// Schema with fast fields only
let mut schema_builder = Schema::builder();
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
{
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
match distribution {
"dense" => {
for doc_id in 0..num_docs {
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
writer
.add_document(doc!(
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
"sparse" => {
for doc_id in 0..num_docs {
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;
writer
.add_document(doc!(
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
_ => {
panic!("Unsupported distribution type");
}
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
BenchIndex { index, searcher }
}
fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
// Dense distribution - random values in small range (0-999)
(
"dense_values_search_low_value_range".to_string(),
10_000_000,
"dense",
0,
9,
),
(
"dense_values_search_high_value_range".to_string(),
10_000_000,
"dense",
990,
999,
),
(
"dense_values_search_out_of_range".to_string(),
10_000_000,
"dense",
1000,
1002,
),
(
"sparse_values_search_low_value_range".to_string(),
10_000_000,
"sparse",
0,
9,
),
(
"sparse_values_search_high_value_range".to_string(),
10_000_000,
"sparse",
9_999_990,
9_999_999,
),
(
"sparse_values_search_out_of_range".to_string(),
10_000_000,
"sparse",
10_000_000,
10_000_002,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, n, num_rand_distribution, range_low, range_high) in scenarios {
// Build index for this scenario
let bench_index = build_shared_indices(n, num_rand_distribution);
// Create benchmark group
let mut group = runner.new_group();
// Now set the name (this moves scenario_id)
group.set_name(scenario_id);
// Define fast field types
let field_names = ["num_rand_fast", "num_asc_fast"];
// Generate range queries for fast fields
for &field_name in &field_names {
// Create the range query
let field = bench_index.searcher.schema().get_field(field_name).unwrap();
let lower_term = Term::from_field_u64(field, range_low);
let upper_term = Term::from_field_u64(field, range_high);
let query = RangeQuery::new(Bound::Included(lower_term), Bound::Included(upper_term));
run_benchmark_tasks(
&mut group,
&bench_index,
query,
field_name,
range_low,
range_high,
);
}
group.run();
}
}
/// Run all benchmark tasks for a given range query and field name
fn run_benchmark_tasks(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
field_name: &str,
range_low: u64,
range_high: u64,
) {
// Test count
add_bench_task_count(
bench_group,
bench_index,
query.clone(),
"count",
field_name,
range_low,
range_high,
);
// Test top 100 by the field (ascending order)
{
let collector_name = format!("top100_by_{}_asc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task_top100_asc(
bench_group,
bench_index,
query.clone(),
&collector_name,
field_name,
range_low,
range_high,
field_name_owned,
);
}
// Test top 100 by the field (descending order)
{
let collector_name = format!("top100_by_{}_desc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task_top100_desc(
bench_group,
bench_index,
query,
&collector_name,
field_name,
range_low,
range_high,
field_name_owned,
);
}
}
fn add_bench_task_count(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = CountSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_docset(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = DocSetSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_top100_asc(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
field_name_owned: String,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = Top100AscSearchTask {
searcher: bench_index.searcher.clone(),
query,
field_name: field_name_owned,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_top100_desc(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
field_name_owned: String,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = Top100DescSearchTask {
searcher: bench_index.searcher.clone(),
query,
field_name: field_name_owned,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
struct CountSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl CountSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
self.searcher.search(&self.query, &Count).unwrap()
}
}
struct DocSetSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl DocSetSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let result = self.searcher.search(&self.query, &DocSetCollector).unwrap();
result.len()
}
}
struct Top100AscSearchTask {
searcher: Searcher,
query: RangeQuery,
field_name: String,
}
impl Top100AscSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let collector =
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Asc);
let result = self.searcher.search(&self.query, &collector).unwrap();
for (_score, doc_address) in &result {
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
}
result.len()
}
}
struct Top100DescSearchTask {
searcher: Searcher,
query: RangeQuery,
field_name: String,
}
impl Top100DescSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let collector =
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Desc);
let result = self.searcher.search(&self.query, &collector).unwrap();
for (_score, doc_address) in &result {
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
}
result.len()
}
}

View File

@@ -1,260 +0,0 @@
use std::fmt::Display;
use std::net::Ipv6Addr;
use std::ops::RangeInclusive;
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use columnar::MonotonicallyMappableToU128;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
bench_range_query();
}
fn bench_range_query() {
let index = get_index_0_to_100();
let mut runner = BenchRunner::new();
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
runner.set_name("range_query on u64");
let field_name_and_descr: Vec<_> = vec![
("id", "Single Valued Range Field"),
("ids", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent()),
("10_percent", get_10_percent()),
("1_percent", get_1_percent()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
runner.set_name("range_query on ip");
let field_name_and_descr: Vec<_> = vec![
("ip", "Single Valued Range Field"),
("ips", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent_ip()),
("10_percent", get_10_percent_ip()),
("1_percent", get_1_percent_ip()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
}
fn test_range<T: Display>(
runner: &mut BenchRunner,
index: &Index,
field_name_and_descr: &[(&str, &str)],
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
) {
for (field, suffix) in field_name_and_descr {
let term_num_hits = vec![
("", ""),
("1_percent", "veryfew"),
("10_percent", "few"),
("90_percent", "most"),
];
let mut group = runner.new_group();
group.set_name(suffix);
// all intersect combinations
for (range_name, range) in &range_num_hits {
for (term_name, term) in &term_num_hits {
let index = &index;
let test_name = if term_name.is_empty() {
format!("id_range_hit_{}", range_name)
} else {
format!(
"id_range_hit_{}_intersect_with_term_{}",
range_name, term_name
)
};
group.register(test_name, move |_| {
let query = if term_name.is_empty() {
"".to_string()
} else {
format!("AND id_name:{}", term)
};
black_box(execute_query(field, range, &query, index));
});
}
}
group.run();
}
}
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_docs(&docs)
}
#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
pub ip: Ipv6Addr,
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default().set_fast().set_indexed(),
);
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default().set_fast().set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}
fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}
fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
struct NumHits {
count: usize,
}
impl OutputValue for NumHits {
fn column_title() -> &'static str {
"NumHits"
}
fn format(&self) -> Option<String> {
Some(self.count.to_string())
}
}
fn execute_query<T: Display>(
field: &str,
id_range: &RangeInclusive<T>,
suffix: &str,
index: &Index,
) -> NumHits {
let gen_query_inclusive = |from: &T, to: &T| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(id_range.start(), id_range.end());
execute_query_(&query, index)
}
fn execute_query_(query: &str, index: &Index) -> NumHits {
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let num_hits = searcher
.search(&query, &(TopDocs::with_limit(10).order_by_score(), Count))
.unwrap()
.1;
NumHits { count: num_hits }
}

View File

@@ -1,7 +1,7 @@
[package] [package]
name = "tantivy-bitpacker" name = "tantivy-bitpacker"
version = "0.9.0" version = "0.6.0"
edition = "2024" edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = [] categories = []

View File

@@ -48,7 +48,7 @@ impl BitPacker {
pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> { pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 { if self.mini_buffer_written > 0 {
let num_bytes = self.mini_buffer_written.div_ceil(8); let num_bytes = (self.mini_buffer_written + 7) / 8;
let bytes = self.mini_buffer.to_le_bytes(); let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?; output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0; self.mini_buffer_written = 0;
@@ -65,7 +65,7 @@ impl BitPacker {
#[derive(Clone, Debug, Default, Copy)] #[derive(Clone, Debug, Default, Copy)]
pub struct BitUnpacker { pub struct BitUnpacker {
num_bits: usize, num_bits: u32,
mask: u64, mask: u64,
} }
@@ -83,7 +83,7 @@ impl BitUnpacker {
(1u64 << num_bits) - 1u64 (1u64 << num_bits) - 1u64
}; };
BitUnpacker { BitUnpacker {
num_bits: usize::from(num_bits), num_bits: u32::from(num_bits),
mask, mask,
} }
} }
@@ -94,14 +94,14 @@ impl BitUnpacker {
#[inline] #[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 { pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
let addr_in_bits = idx as usize * self.num_bits; let addr_in_bits = idx * self.num_bits;
let addr = addr_in_bits >> 3; let addr = (addr_in_bits >> 3) as usize;
if addr + 8 > data.len() { if addr + 8 > data.len() {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0; return 0;
} }
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
return self.get_slow_path(addr, bit_shift as u32, data); return self.get_slow_path(addr, bit_shift, data);
} }
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap(); let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
@@ -134,13 +134,12 @@ impl BitUnpacker {
"Bitwidth must be <= 32 to use this method." "Bitwidth must be <= 32 to use this method."
); );
let end_idx: u32 = start_idx + output.len() as u32; let end_idx = start_idx + output.len() as u32;
// We use `usize` here to avoid overflow issues. let end_bit_read = end_idx * self.num_bits;
let end_bit_read = (end_idx as usize) * self.num_bits; let end_byte_read = (end_bit_read + 7) / 8;
let end_byte_read = end_bit_read.div_ceil(8);
assert!( assert!(
end_byte_read <= data.len(), end_byte_read as usize <= data.len(),
"Requested index is out of bounds." "Requested index is out of bounds."
); );
@@ -160,24 +159,24 @@ impl BitUnpacker {
// We want the start of the fast track to start align with bytes. // We want the start of the fast track to start align with bytes.
// A sufficient condition is to start with an idx that is a multiple of 8, // A sufficient condition is to start with an idx that is a multiple of 8,
// so highway start is the closest multiple of 8 that is >= start_idx. // so highway start is the closest multiple of 8 that is >= start_idx.
let entrance_ramp_len: u32 = 8 - (start_idx % 8) % 8; let entrance_ramp_len = 8 - (start_idx % 8) % 8;
let highway_start: u32 = start_idx + entrance_ramp_len; let highway_start: u32 = start_idx + entrance_ramp_len;
if highway_start + (BitPacker1x::BLOCK_LEN as u32) > end_idx { if highway_start + BitPacker1x::BLOCK_LEN as u32 > end_idx {
// We don't have enough values to have even a single block of highway. // We don't have enough values to have even a single block of highway.
// Let's just supply the values the simple way. // Let's just supply the values the simple way.
get_batch_ramp(start_idx, output); get_batch_ramp(start_idx, output);
return; return;
} }
let num_blocks: usize = (end_idx - highway_start) as usize / BitPacker1x::BLOCK_LEN; let num_blocks: u32 = (end_idx - highway_start) / BitPacker1x::BLOCK_LEN as u32;
// Entrance ramp // Entrance ramp
get_batch_ramp(start_idx, &mut output[..entrance_ramp_len as usize]); get_batch_ramp(start_idx, &mut output[..entrance_ramp_len as usize]);
// Highway // Highway
let mut offset = (highway_start as usize * self.num_bits) / 8; let mut offset = (highway_start * self.num_bits) as usize / 8;
let mut output_cursor = (highway_start - start_idx) as usize; let mut output_cursor = (highway_start - start_idx) as usize;
for _ in 0..num_blocks { for _ in 0..num_blocks {
offset += BitPacker1x.decompress( offset += BitPacker1x.decompress(
@@ -189,7 +188,7 @@ impl BitUnpacker {
} }
// Exit ramp // Exit ramp
let highway_end: u32 = highway_start + (num_blocks * BitPacker1x::BLOCK_LEN) as u32; let highway_end = highway_start + num_blocks * BitPacker1x::BLOCK_LEN as u32;
get_batch_ramp(highway_end, &mut output[output_cursor..]); get_batch_ramp(highway_end, &mut output[output_cursor..]);
} }
@@ -258,7 +257,7 @@ mod test {
bitpacker.write(val, num_bits, &mut data).unwrap(); bitpacker.write(val, num_bits, &mut data).unwrap();
} }
bitpacker.close(&mut data).unwrap(); bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len).div_ceil(8)); assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8);
let bitunpacker = BitUnpacker::new(num_bits); let bitunpacker = BitUnpacker::new(num_bits);
(bitunpacker, vals, data) (bitunpacker, vals, data)
} }
@@ -304,7 +303,7 @@ mod test {
bitpacker.write(val, num_bits, &mut buffer).unwrap(); bitpacker.write(val, num_bits, &mut buffer).unwrap();
} }
bitpacker.flush(&mut buffer).unwrap(); bitpacker.flush(&mut buffer).unwrap();
assert_eq!(buffer.len(), (vals.len() * num_bits as usize).div_ceil(8)); assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
let bitunpacker = BitUnpacker::new(num_bits); let bitunpacker = BitUnpacker::new(num_bits);
let max_val = if num_bits == 64 { let max_val = if num_bits == 64 {
u64::MAX u64::MAX

View File

@@ -1,6 +1,6 @@
use super::bitpacker::BitPacker; use super::bitpacker::BitPacker;
use super::compute_num_bits; use super::compute_num_bits;
use crate::{BitUnpacker, minmax}; use crate::{minmax, BitUnpacker};
const BLOCK_SIZE: usize = 128; const BLOCK_SIZE: usize = 128;
@@ -34,7 +34,7 @@ struct BlockedBitpackerEntryMetaData {
impl BlockedBitpackerEntryMetaData { impl BlockedBitpackerEntryMetaData {
fn new(offset: u64, num_bits: u8, base_value: u64) -> Self { fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
let encoded = offset | (u64::from(num_bits) << (64 - 8)); let encoded = offset | (num_bits as u64) << (64 - 8);
Self { Self {
encoded, encoded,
base_value, base_value,
@@ -140,10 +140,10 @@ impl BlockedBitpacker {
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ { pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress a whole block and cache it instead // todo performance: we could decompress a whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE; let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
let iter = (0..bitpacked_elems)
(0..bitpacked_elems)
.map(move |idx| self.get(idx)) .map(move |idx| self.get(idx))
.chain(self.buffer.iter().cloned()) .chain(self.buffer.iter().cloned());
iter
} }
} }

View File

@@ -19,7 +19,7 @@ fn u32_to_i32(val: u32) -> i32 {
#[inline] #[inline]
unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType { unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]); const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
unsafe { op_xor(vals_u32x8s, HIGHEST_BIT_MASK) } op_xor(vals_u32x8s, HIGHEST_BIT_MASK)
} }
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) { pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
@@ -66,19 +66,17 @@ unsafe fn filter_vec_avx2_aux(
]); ]);
const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]); const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
for _ in 0..num_words { for _ in 0..num_words {
unsafe { let word = load_unaligned(input);
let word = load_unaligned(input); let word = u32_to_i32_avx2(word);
let word = u32_to_i32_avx2(word); let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
let keeper_bitset = compute_filter_bitset(word, range_simd.clone()); let added_len = keeper_bitset.count_ones();
let added_len = keeper_bitset.count_ones(); let filtered_doc_ids = compact(ids, keeper_bitset);
let filtered_doc_ids = compact(ids, keeper_bitset); store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
store_unaligned(output_tail as *mut __m256i, filtered_doc_ids); output_tail = output_tail.offset(added_len as isize);
output_tail = output_tail.offset(added_len as isize); ids = op_add(ids, SHIFT);
ids = op_add(ids, SHIFT); input = input.offset(1);
input = input.offset(1);
}
} }
unsafe { output_tail.offset_from(output) as usize } output_tail.offset_from(output) as usize
} }
#[inline] #[inline]
@@ -94,7 +92,8 @@ unsafe fn compute_filter_bitset(val: __m256i, range: std::ops::RangeInclusive<__
let too_low = op_greater(*range.start(), val); let too_low = op_greater(*range.start(), val);
let too_high = op_greater(val, *range.end()); let too_high = op_greater(val, *range.end());
let inside = op_or(too_low, too_high); let inside = op_or(too_low, too_high);
255 - std::arch::x86_64::_mm256_movemask_ps(_mm256_castsi256_ps(inside)) as u8 255 - std::arch::x86_64::_mm256_movemask_ps(std::mem::transmute::<DataType, __m256>(inside))
as u8
} }
union U8x32 { union U8x32 {

View File

@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar]; const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
impl FilterImplPerInstructionSet { impl FilterImplPerInstructionSet {
#[allow(unused_variables)]
#[inline] #[inline]
#[allow(unused_variables)] // on non-x86_64, code is unused.
fn from(code: u8) -> FilterImplPerInstructionSet { fn from(code: u8) -> FilterImplPerInstructionSet {
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
if code == FilterImplPerInstructionSet::AVX2 as u8 { if code == FilterImplPerInstructionSet::AVX2 as u8 {

View File

@@ -33,7 +33,11 @@ pub use crate::blocked_bitpacker::BlockedBitpacker;
/// number of bits. /// number of bits.
pub fn compute_num_bits(n: u64) -> u8 { pub fn compute_num_bits(n: u64) -> u8 {
let amplitude = (64u32 - n.leading_zeros()) as u8; let amplitude = (64u32 - n.leading_zeros()) as u8;
if amplitude <= 64 - 8 { amplitude } else { 64 } if amplitude <= 64 - 8 {
amplitude
} else {
64
}
} }
/// Computes the (min, max) of an iterator of `PartialOrd` values. /// Computes the (min, max) of an iterator of `PartialOrd` values.

View File

@@ -16,14 +16,14 @@ body = """
{%- if version %} in {{ version }}{%- endif -%} {%- if version %} in {{ version }}{%- endif -%}
{% for commit in commits %} {% for commit in commits %}
{% if commit.remote.pr_title -%} {% if commit.github.pr_title -%}
{%- set commit_message = commit.remote.pr_title -%} {%- set commit_message = commit.github.pr_title -%}
{%- else -%} {%- else -%}
{%- set commit_message = commit.message -%} {%- set commit_message = commit.message -%}
{%- endif -%} {%- endif -%}
- {{ commit_message | split(pat="\n") | first | trim }}\ - {{ commit_message | split(pat="\n") | first | trim }}\
{% if commit.remote.pr_number %} \ {% if commit.github.pr_number %} \
[#{{ commit.remote.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.remote.pr_number }}){% if commit.remote.username %}(@{{ commit.remote.username }}){%- endif -%} \ [#{{ commit.github.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.github.pr_number }}){% if commit.github.username %}(@{{ commit.github.username }}){%- endif -%} \
{%- endif %} {%- endif %}
{%- endfor -%} {%- endfor -%}

View File

@@ -1,7 +1,7 @@
[package] [package]
name = "tantivy-columnar" name = "tantivy-columnar"
version = "0.6.0" version = "0.3.0"
edition = "2024" edition = "2021"
license = "MIT" license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy" homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy" repository = "https://github.com/quickwit-oss/tantivy"
@@ -9,21 +9,21 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"] categories = ["database-implementations", "data-structures", "compression"]
[dependencies] [dependencies]
itertools = "0.14.0" itertools = "0.13.0"
fastdivide = "0.4.0" fastdivide = "0.4.0"
stacker = { version= "0.6", path = "../stacker", package="tantivy-stacker"} stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.6", path = "../sstable", package = "tantivy-sstable" } sstable = { version= "0.3", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.10", path = "../common", package = "tantivy-common" } common = { version= "0.7", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.9", path = "../bitpacker/" } tantivy-bitpacker = { version= "0.6", path = "../bitpacker/" }
serde = "1.0.152" serde = "1.0.152"
downcast-rs = "2.0.1" downcast-rs = "1.2.0"
[dev-dependencies] [dev-dependencies]
proptest = "1" proptest = "1"
more-asserts = "0.3.1" more-asserts = "0.3.1"
rand = "0.8" rand = "0.8"
binggan = "0.14.0" binggan = "0.12.0"
[[bench]] [[bench]]
name = "bench_merge" name = "bench_merge"
@@ -33,29 +33,6 @@ harness = false
name = "bench_access" name = "bench_access"
harness = false harness = false
[[bench]]
name = "bench_first_vals"
harness = false
[[bench]]
name = "bench_values_u64"
harness = false
[[bench]]
name = "bench_values_u128"
harness = false
[[bench]]
name = "bench_create_column_values"
harness = false
[[bench]]
name = "bench_column_values_get"
harness = false
[[bench]]
name = "bench_optional_index"
harness = false
[features] [features]
zstd-compression = ["sstable/zstd-compression"] unstable = []

View File

@@ -73,7 +73,7 @@ The crate introduces the following concepts.
`Columnar` is an equivalent of a dataframe. `Columnar` is an equivalent of a dataframe.
It maps `column_key` to `Column`. It maps `column_key` to `Column`.
A `Column<T>` associates a `RowId` (u32) to any A `Column<T>` asssociates a `RowId` (u32) to any
number of values. number of values.
This is made possible by wrapping a `ColumnIndex` and a `ColumnValue` object. This is made possible by wrapping a `ColumnIndex` and a `ColumnValue` object.

View File

@@ -1,4 +1,4 @@
use binggan::{InputGroup, black_box}; use binggan::{black_box, InputGroup};
use common::*; use common::*;
use tantivy_columnar::Column; use tantivy_columnar::Column;
@@ -19,7 +19,7 @@ fn main() {
let mut add_card = |card1: Card| { let mut add_card = |card1: Card| {
inputs.push(( inputs.push((
card1.to_string(), format!("{card1}"),
generate_columnar_and_open(card1, NUM_DOCS), generate_columnar_and_open(card1, NUM_DOCS),
)); ));
}; };
@@ -42,6 +42,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
} }
} }
black_box(sum); black_box(sum);
None
}); });
runner.register("access_first_vals", |column| { runner.register("access_first_vals", |column| {
let mut sum = 0; let mut sum = 0;
@@ -50,7 +51,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
let mut buffer = vec![None; BLOCK_SIZE]; let mut buffer = vec![None; BLOCK_SIZE];
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) { for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
// fill docs // fill docs
#[allow(clippy::needless_range_loop)]
for idx in 0..BLOCK_SIZE { for idx in 0..BLOCK_SIZE {
docs[idx] = idx as u32 + i; docs[idx] = idx as u32 + i;
} }
@@ -63,6 +63,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
} }
black_box(sum); black_box(sum);
None
}); });
runner.run(); runner.run();
} }

View File

@@ -1,61 +0,0 @@
use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::ColumnValues;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
type Col = Arc<dyn ColumnValues<u64>>;
fn main() {
let data = get_data();
let inputs: Vec<(String, Col)> = vec![
(
"bitpacked".to_string(),
serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Bitpacked]),
),
(
"linear".to_string(),
serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Linear]),
),
(
"blockwise_linear".to_string(),
serialize_and_load_u64_based_column_values(
&data.as_slice(),
&[CodecType::BlockwiseLinear],
),
),
];
let mut group: InputGroup<Col> = InputGroup::new_with_inputs(inputs);
group.register("fastfield_get", |col: &Col| {
let mut sum = 0u64;
for pos in value_iter() {
sum = sum.wrapping_add(col.get_val(pos as u32));
}
black_box(sum);
});
group.run();
}

View File

@@ -1,44 +0,0 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_values};
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn main() {
let data = get_data();
let mut group: InputGroup<(CodecType, Vec<u64>)> = InputGroup::new_with_inputs(vec![
(
"bitpacked codec".to_string(),
(CodecType::Bitpacked, data.clone()),
),
(
"linear codec".to_string(),
(CodecType::Linear, data.clone()),
),
(
"blockwise linear codec".to_string(),
(CodecType::BlockwiseLinear, data.clone()),
),
]);
group.register("serialize column_values", |data| {
let mut buffer = Vec::new();
serialize_u64_based_column_values(&data.1.as_slice(), &[data.0], &mut buffer).unwrap();
black_box(buffer.len());
});
group.run();
}

View File

@@ -1,9 +1,12 @@
#![feature(test)]
extern crate test;
use std::sync::Arc; use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::prelude::*; use rand::prelude::*;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values}; use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
use tantivy_columnar::*; use tantivy_columnar::*;
use test::{black_box, Bencher};
struct Columns { struct Columns {
pub optional: Column, pub optional: Column,
@@ -65,38 +68,88 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
serialize_and_load_u64_based_column_values(&column, &[codec_type]) serialize_and_load_u64_based_column_values(&column, &[codec_type])
} }
fn main() { fn run_bench_on_column_full_scan(b: &mut Bencher, column: Column) {
let Columns { let num_iter = black_box(NUM_VALUES);
optional, b.iter(|| {
full,
multi,
} = get_test_columns();
let inputs = vec![
("full".to_string(), full),
("optional".to_string(), optional),
("multi".to_string(), multi),
];
let mut group = InputGroup::new_with_inputs(inputs);
group.register("first_full_scan", |column| {
let mut sum = 0u64; let mut sum = 0u64;
for i in 0..NUM_VALUES as u32 { for i in 0..num_iter as u32 {
let val = column.first(i); let val = column.first(i);
sum += val.unwrap_or(0); sum += val.unwrap_or(0);
} }
black_box(sum); sum
}); });
}
group.register("first_block_single_calls", |column| { fn run_bench_on_column_block_fetch(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64]; let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>(); let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
column.first_vals(&fetch_docids, &mut block);
block[0]
});
}
fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
for i in 0..fetch_docids.len() { for i in 0..fetch_docids.len() {
block[i] = column.first(fetch_docids[i]); block[i] = column.first(fetch_docids[i]);
} }
black_box(block[0]); block[0]
}); });
}
group.run();
/// Column first method
#[bench]
fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_full_scan(b, column);
}
/// Block fetch column accessor
#[bench]
fn bench_get_block_first_on_optional_column(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_single_calls(b, column);
} }

View File

@@ -1,7 +1,7 @@
pub mod common; pub mod common;
use binggan::BenchRunner; use binggan::{black_box, BenchRunner};
use common::{Card, generate_columnar_with_name}; use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*; use tantivy_columnar::*;
const NUM_DOCS: u32 = 100_000; const NUM_DOCS: u32 = 100_000;
@@ -29,7 +29,7 @@ fn main() {
add_combo(Card::Multi, Card::Dense); add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse); add_combo(Card::Multi, Card::Sparse);
let mut runner: BenchRunner = BenchRunner::new(); let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group(); let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() { for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input( group.register_with_input(

View File

@@ -1,106 +0,0 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_index::{OptionalIndex, Set};
const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_optional_index(fill_ratio: f64) -> OptionalIndex {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let vals: Vec<u32> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.map(|(pos, _)| pos as u32)
.collect();
OptionalIndex::for_test(TOTAL_NUM_VALUES, &vals)
}
fn random_range_iterator(
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
if current >= end { None } else { Some(current) }
})
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
)
}
fn walk_over_data_from_positions(
codec: &OptionalIndex,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.rank_if_exists(idx));
}
dense_idx
}
fn main() {
// Build separate inputs for each fill ratio.
let inputs: Vec<(String, OptionalIndex)> = vec![
("fill=1%".to_string(), gen_optional_index(0.01)),
("fill=5%".to_string(), gen_optional_index(0.05)),
("fill=10%".to_string(), gen_optional_index(0.10)),
("fill=50%".to_string(), gen_optional_index(0.50)),
("fill=90%".to_string(), gen_optional_index(0.90)),
];
let mut group: InputGroup<OptionalIndex> = InputGroup::new_with_inputs(inputs);
// Translate orig->codec (rank_if_exists) with sampling
group.register("orig_to_codec_10pct_hit", |codec: &OptionalIndex| {
black_box(walk_over_data(codec, 100));
});
group.register("orig_to_codec_1pct_hit", |codec: &OptionalIndex| {
black_box(walk_over_data(codec, 1000));
});
group.register("orig_to_codec_full_scan", |codec: &OptionalIndex| {
black_box(walk_over_data_from_positions(codec, 0..TOTAL_NUM_VALUES));
});
// Translate codec->orig (select/select_batch) on sampled ranks
fn bench_translate_codec_to_orig_util(codec: &OptionalIndex, percent_hit: f32) {
let num_non_nulls = codec.num_non_nulls();
let idxs: Vec<u32> = if percent_hit == 100.0f32 {
(0..num_non_nulls).collect()
} else {
n_percent_step_iterator(percent_hit, num_non_nulls).collect()
};
let mut output = vec![0u32; idxs.len()];
output.copy_from_slice(&idxs[..]);
codec.select_batch(&mut output);
black_box(output);
}
group.register("codec_to_orig_0.005pct_hit", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 0.005);
});
group.register("codec_to_orig_10pct_hit", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 10.0);
});
group.register("codec_to_orig_full_scan", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 100.0);
});
group.run();
}

View File

@@ -1,12 +1,15 @@
#![feature(test)]
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use std::sync::Arc; use std::sync::Arc;
use binggan::{InputGroup, black_box};
use common::OwnedBytes; use common::OwnedBytes;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::seq::SliceRandom; use rand::seq::SliceRandom;
use rand::{Rng, SeedableRng, random}; use rand::{random, Rng, SeedableRng};
use tantivy_columnar::ColumnValues; use tantivy_columnar::ColumnValues;
use test::Bencher;
extern crate test;
// TODO does this make sense for IPv6 ? // TODO does this make sense for IPv6 ?
fn generate_random() -> Vec<u64> { fn generate_random() -> Vec<u64> {
@@ -44,77 +47,78 @@ fn get_data_50percent_item() -> Vec<u128> {
} }
data.push(SINGLE_ITEM); data.push(SINGLE_ITEM);
data.shuffle(&mut rng); data.shuffle(&mut rng);
data.iter().map(|el| *el as u128).collect::<Vec<_>>() let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
data
} }
fn main() { #[bench]
fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item(); let data = get_data_50percent_item();
let column_range = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
let column_random = get_u128_column_random();
struct Inputs { b.iter(|| {
data: Vec<u128>,
column_range: Arc<dyn ColumnValues<u128>>,
column_random: Arc<dyn ColumnValues<u128>>,
}
let inputs = Inputs {
data,
column_range,
column_random,
};
let mut group: InputGroup<Inputs> =
InputGroup::new_with_inputs(vec![("u128 benches".to_string(), inputs)]);
group.register(
"intfastfield_getrange_u128_50percent_hit",
|inp: &Inputs| {
let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
0..inp.data.len() as u32,
&mut positions,
);
black_box(positions.len());
},
);
group.register("intfastfield_getrange_u128_single_hit", |inp: &Inputs| {
let mut positions = Vec::new(); let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range( column.get_row_ids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
*SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128, *SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
0..inp.data.len() as u32, 0..data.len() as u32,
&mut positions, &mut positions,
); );
black_box(positions.len()); positions
}); });
}
group.register("intfastfield_getrange_u128_hit_all", |inp: &Inputs| { #[bench]
fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new(); let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range( column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
0..=u128::MAX, positions
0..inp.data.len() as u32,
&mut positions,
);
black_box(positions.len());
}); });
}
// U128 RANGE END
group.register("intfastfield_scan_all_fflookup_u128", |inp: &Inputs| { #[bench]
fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {
let column = get_u128_column_random();
b.iter(|| {
let mut a = 0u128; let mut a = 0u128;
for i in 0u64..inp.column_random.num_vals() as u64 { for i in 0u64..column.num_vals() as u64 {
a += inp.column_random.get_val(i as u32); a += column.get_val(i as u32);
} }
black_box(a); a
}); });
}
group.register("intfastfield_jumpy_stride5_u128", |inp: &Inputs| { #[bench]
let n = inp.column_random.num_vals(); fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
let column = get_u128_column_random();
b.iter(|| {
let n = column.num_vals();
let mut a = 0u128; let mut a = 0u128;
for i in (0..n / 5).map(|val| val * 5) { for i in (0..n / 5).map(|val| val * 5) {
a += inp.column_random.get_val(i); a += column.get_val(i);
} }
black_box(a); a
}); });
group.run();
} }

View File

@@ -1,10 +1,13 @@
#![feature(test)]
extern crate test;
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use std::sync::Arc; use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::prelude::*; use rand::prelude::*;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values}; use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
use tantivy_columnar::*; use tantivy_columnar::*;
use test::Bencher;
// Warning: this generates the same permutation at each call // Warning: this generates the same permutation at each call
fn generate_permutation() -> Vec<u64> { fn generate_permutation() -> Vec<u64> {
@@ -24,11 +27,37 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
serialize_and_load_u64_based_column_values(&column, &[codec_type]) serialize_and_load_u64_based_column_values(&column, &[codec_type])
} }
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_jumpy_fflookup_bitpacked(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u32);
}
a
});
}
const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50; const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
const SINGLE_ITEM: u64 = 90; const SINGLE_ITEM: u64 = 90;
const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90; const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49; const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
fn get_data_50percent_item() -> Vec<u128> { fn get_data_50percent_item() -> Vec<u128> {
let mut rng = StdRng::from_seed([1u8; 32]); let mut rng = StdRng::from_seed([1u8; 32]);
@@ -40,122 +69,135 @@ fn get_data_50percent_item() -> Vec<u128> {
data.push(SINGLE_ITEM); data.push(SINGLE_ITEM);
data.shuffle(&mut rng); data.shuffle(&mut rng);
data.iter().map(|el| *el as u128).collect::<Vec<_>>() let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
data
} }
type VecCol = (Vec<u64>, Arc<dyn ColumnValues<u64>>); // U64 RANGE START
#[bench]
fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
FIFTY_PERCENT_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
fn bench_access() { #[bench]
fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
positions
});
}
// U64 RANGE END
#[bench]
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
let permutation = generate_permutation(); let permutation = generate_permutation();
let column_perm: Arc<dyn ColumnValues<u64>> = let n = permutation.len();
serialize_and_load(&permutation, CodecType::Bitpacked); b.iter(|| {
let permutation_gcd = generate_permutation_gcd();
let column_perm_gcd: Arc<dyn ColumnValues<u64>> =
serialize_and_load(&permutation_gcd, CodecType::Bitpacked);
let mut group: InputGroup<VecCol> = InputGroup::new_with_inputs(vec![
(
"access".to_string(),
(permutation.clone(), column_perm.clone()),
),
(
"access_gcd".to_string(),
(permutation_gcd.clone(), column_perm_gcd.clone()),
),
]);
group.register("stride7_vec", |inp: &VecCol| {
let n = inp.0.len();
let mut a = 0u64; let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) { for i in (0..n / 7).map(|val| val * 7) {
a += inp.0[i]; a += permutation[i as usize];
} }
black_box(a); a
}); });
}
group.register("fullscan_vec", |inp: &VecCol| { #[bench]
let mut a = 0u64; fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
for i in 0..inp.0.len() { let permutation = generate_permutation();
a += inp.0[i]; let n = permutation.len();
} let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
black_box(a); b.iter(|| {
}); let mut a = 0;
group.register("stride7_column_values", |inp: &VecCol| {
let n = inp.1.num_vals() as usize;
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) { for i in (0..n / 7).map(|val| val * 7) {
a += inp.1.get_val(i as u32); a += column.get_val(i as u32);
} }
black_box(a); a
}); });
}
group.register("fullscan_column_values", |inp: &VecCol| { #[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
let column_ref = column.as_ref();
b.iter(|| {
let mut a = 0u64;
for i in 0u32..n as u32 {
a += column_ref.get_val(i);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
let permutation = generate_permutation_gcd();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
let n = inp.1.num_vals() as usize;
for i in 0..n { for i in 0..n {
a += inp.1.get_val(i as u32); a += column.get_val(i as u32);
} }
black_box(a); a
}); });
group.run();
} }
fn bench_range() { #[bench]
let data_50 = get_data_50percent_item(); fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let data_u64 = data_50.iter().map(|el| *el as u64).collect::<Vec<_>>(); let permutation = generate_permutation();
let column_data: Arc<dyn ColumnValues<u64>> = b.iter(|| {
serialize_and_load(&data_u64, CodecType::Bitpacked); let mut a = 0u64;
for i in 0..permutation.len() {
let mut group: InputGroup<Arc<dyn ColumnValues<u64>>> = a += permutation[i as usize] as u64;
InputGroup::new_with_inputs(vec![("dist_50pct_item".to_string(), column_data.clone())]); }
a
group.register( });
"fastfield_getrange_u64_50percent_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(FIFTY_PERCENT_RANGE, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_1percent_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..col.num_vals(),
&mut positions,
);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_single_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_hit_all",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(0..=u64::MAX, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.run();
}
fn main() {
bench_access();
bench_range();
} }

View File

@@ -1,18 +0,0 @@
[package]
name = "tantivy-columnar-inspect"
version = "0.1.0"
edition = "2021"
license = "MIT"
[dependencies]
tantivy = {path="../..", package="tantivy"}
columnar = {path="../", package="tantivy-columnar"}
common = {path="../../common", package="tantivy-common"}
[workspace]
members = []
[profile.release]
debug = true
#debug-assertions = true
#overflow-checks = true

View File

@@ -1,54 +0,0 @@
use columnar::ColumnarReader;
use common::file_slice::{FileSlice, WrapFile};
use std::io;
use std::path::Path;
use tantivy::directory::footer::Footer;
fn main() -> io::Result<()> {
println!("Opens a columnar file written by tantivy and validates it.");
let path = std::env::args().nth(1).unwrap();
let path = Path::new(&path);
println!("Reading {:?}", path);
let _reader = open_and_validate_columnar(path.to_str().unwrap())?;
Ok(())
}
pub fn validate_columnar_reader(reader: &ColumnarReader) {
let num_rows = reader.num_rows();
println!("num_rows: {}", num_rows);
let columns = reader.list_columns().unwrap();
println!("num columns: {:?}", columns.len());
for (col_name, dynamic_column_handle) in columns {
let col = dynamic_column_handle.open().unwrap();
match col {
columnar::DynamicColumn::Bool(_)
| columnar::DynamicColumn::I64(_)
| columnar::DynamicColumn::U64(_)
| columnar::DynamicColumn::F64(_)
| columnar::DynamicColumn::IpAddr(_)
| columnar::DynamicColumn::DateTime(_)
| columnar::DynamicColumn::Bytes(_) => {}
columnar::DynamicColumn::Str(str_column) => {
let num_vals = str_column.ords().values.num_vals();
let num_terms_dict = str_column.num_terms() as u64;
let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
for ord in str_column.ords().values.iter() {
assert!(ord < num_terms_dict);
}
}
}
}
}
/// Opens a columnar file that was written by tantivy and validates it.
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
let reader = ColumnarReader::open(slice).unwrap();
validate_columnar_reader(&reader);
Ok(reader)
}

View File

@@ -29,20 +29,12 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
} }
} }
#[inline] #[inline]
pub fn fetch_block_with_missing( pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
&mut self,
docs: &[u32],
accessor: &Column<T>,
missing: Option<T>,
) {
self.fetch_block(docs, accessor); self.fetch_block(docs, accessor);
// no missing values // no missing values
if accessor.index.get_cardinality().is_full() { if accessor.index.get_cardinality().is_full() {
return; return;
} }
let Some(missing) = missing else {
return;
};
// We can compare docid_cache length with docs to find missing docs // We can compare docid_cache length with docs to find missing docs
// For multi value columns we can't rely on the length and always need to scan // For multi value columns we can't rely on the length and always need to scan
@@ -74,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
&'a self, &'a self,
docs: &'a [u32], docs: &'a [u32],
accessor: &Column<T>, accessor: &Column<T>,
) -> impl Iterator<Item = (DocId, T)> + 'a + use<'a, T> { ) -> impl Iterator<Item = (DocId, T)> + '_ {
if accessor.index.get_cardinality().is_full() { if accessor.index.get_cardinality().is_full() {
docs.iter().cloned().zip(self.val_cache.iter().cloned()) docs.iter().cloned().zip(self.val_cache.iter().cloned())
} else { } else {
@@ -147,7 +139,7 @@ mod tests {
missing_docs.push(missing_doc); missing_docs.push(missing_doc);
}); });
assert_eq!(missing_docs, Vec::<u32>::new()); assert_eq!(missing_docs, vec![]);
} }
#[test] #[test]

View File

@@ -4,8 +4,8 @@ use std::{fmt, io};
use sstable::{Dictionary, VoidSSTable}; use sstable::{Dictionary, VoidSSTable};
use crate::RowId;
use crate::column::Column; use crate::column::Column;
use crate::RowId;
/// Dictionary encoded column. /// Dictionary encoded column.
/// ///

View File

@@ -9,14 +9,13 @@ use std::sync::Arc;
use common::BinarySerializable; use common::BinarySerializable;
pub use dictionary_encoded::{BytesColumn, StrColumn}; pub use dictionary_encoded::{BytesColumn, StrColumn};
pub use serialize::{ pub use serialize::{
open_column_bytes, open_column_str, open_column_u64, open_column_u128, open_column_bytes, open_column_str, open_column_u128, open_column_u128_as_compact_u64,
open_column_u128_as_compact_u64, serialize_column_mappable_to_u64, open_column_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
serialize_column_mappable_to_u128,
}; };
use crate::column_index::{ColumnIndex, Set}; use crate::column_index::{ColumnIndex, Set};
use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal; use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{ColumnValues, monotonic_map_column}; use crate::column_values::{monotonic_map_column, ColumnValues};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId}; use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
#[derive(Clone)] #[derive(Clone)]
@@ -85,8 +84,8 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
#[inline] #[inline]
pub fn first(&self, doc_id: DocId) -> Option<T> { pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(doc_id).next() self.values_for_doc(row_id).next()
} }
/// Load the first value for each docid in the provided slice. /// Load the first value for each docid in the provided slice.
@@ -114,7 +113,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
} }
/// Translates a block of docids to row_ids. /// Translates a block of docis to row_ids.
/// ///
/// returns the row_ids and the matching docids on the same index /// returns the row_ids and the matching docids on the same index
/// e.g. /// e.g.
@@ -131,8 +130,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids) self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
} }
/// Get an iterator over the values for the provided docid.
#[inline]
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ { pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
self.index self.index
.value_row_ids(doc_id) .value_row_ids(doc_id)
@@ -160,6 +157,15 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.select_batch_in_place(selected_docid_range.start, doc_ids); .select_batch_in_place(selected_docid_range.start, doc_ids);
} }
/// Fills the output vector with the (possibly multiple values that are associated_with
/// `row_id`.
///
/// This method clears the `output` vector.
pub fn fill_vals(&self, row_id: RowId, output: &mut Vec<T>) {
output.clear();
output.extend(self.values_for_doc(row_id));
}
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> { pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
Arc::new(FirstValueWithDefault { Arc::new(FirstValueWithDefault {
column: self, column: self,

View File

@@ -6,10 +6,10 @@ use common::OwnedBytes;
use sstable::Dictionary; use sstable::Dictionary;
use crate::column::{BytesColumn, Column}; use crate::column::{BytesColumn, Column};
use crate::column_index::{SerializableColumnIndex, serialize_column_index}; use crate::column_index::{serialize_column_index, SerializableColumnIndex};
use crate::column_values::{ use crate::column_values::{
CodecType, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values, load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values,
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
}; };
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{StrColumn, Version}; use crate::{StrColumn, Version};

View File

@@ -99,9 +99,9 @@ mod tests {
use crate::column_index::merge::detect_cardinality; use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::{ use crate::column_index::multivalued_index::{
MultiValueIndex, open_multivalued_index, serialize_multivalued_index, open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
}; };
use crate::column_index::{OptionalIndex, SerializableColumnIndex, merge_column_index}; use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{ use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder, Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
}; };

View File

@@ -58,7 +58,7 @@ struct ShuffledIndex<'a> {
merge_order: &'a ShuffleMergeOrder, merge_order: &'a ShuffleMergeOrder,
} }
impl Iterable<u32> for ShuffledIndex<'_> { impl<'a> Iterable<u32> for ShuffledIndex<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new( Box::new(
self.merge_order self.merge_order
@@ -127,7 +127,7 @@ fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item
) )
} }
impl Iterable<u32> for ShuffledMultivaluedIndex<'_> { impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order); let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
Box::new(integrate_num_vals(num_vals_per_row)) Box::new(integrate_num_vals(num_vals_per_row))
@@ -137,8 +137,8 @@ impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::RowAddr;
use crate::column_index::OptionalIndex; use crate::column_index::OptionalIndex;
use crate::RowAddr;
#[test] #[test]
fn test_integrate_num_vals_empty() { fn test_integrate_num_vals_empty() {

View File

@@ -1,8 +1,8 @@
use std::ops::Range; use std::ops::Range;
use crate::column_index::SerializableColumnIndex;
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex}; use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
use crate::column_index::serialize::SerializableOptionalIndex; use crate::column_index::serialize::SerializableOptionalIndex;
use crate::column_index::SerializableColumnIndex;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder}; use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
ColumnIndex::Full => Box::new(doc_range), ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new( ColumnIndex::Optional(optional_index) => Box::new(
optional_index optional_index
.iter_non_null_docs() .iter_rows()
.map(move |row| row + doc_range.start), .map(move |row| row + doc_range.start),
), ),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index { ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new( MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index multivalued_index
.optional_index .optional_index
.iter_non_null_docs() .iter_rows()
.map(move |row| row + doc_range.start), .map(move |row| row + doc_range.start),
), ),
}, },
@@ -105,11 +105,10 @@ fn get_num_values_iterator<'a>(
) -> Box<dyn Iterator<Item = u32> + 'a> { ) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index { match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()), ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)), ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n( ColumnIndex::Optional(optional_index) => {
1u32, Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
optional_index.num_non_nulls() as usize, }
)),
ColumnIndex::Multivalued(multivalued_index) => Box::new( ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index multivalued_index
.get_start_index_column() .get_start_index_column()
@@ -124,7 +123,7 @@ fn get_num_values_iterator<'a>(
} }
} }
impl Iterable<u32> for StackedStartOffsets<'_> { impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| { let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32; let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
@@ -178,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
ColumnIndex::Full => Box::new(columnar_row_range), ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new( ColumnIndex::Optional(optional_index) => Box::new(
optional_index optional_index
.iter_non_null_docs() .iter_rows()
.map(move |row_id: RowId| columnar_row_range.start + row_id), .map(move |row_id: RowId| columnar_row_range.start + row_id),
), ),
ColumnIndex::Multivalued(_) => { ColumnIndex::Multivalued(_) => {

View File

@@ -14,7 +14,7 @@ pub use merge::merge_column_index;
pub(crate) use multivalued_index::SerializableMultivalueIndex; pub(crate) use multivalued_index::SerializableMultivalueIndex;
pub use optional_index::{OptionalIndex, Set}; pub use optional_index::{OptionalIndex, Set};
pub use serialize::{ pub use serialize::{
SerializableColumnIndex, SerializableOptionalIndex, open_column_index, serialize_column_index, open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
}; };
use crate::column_index::multivalued_index::MultiValueIndex; use crate::column_index::multivalued_index::MultiValueIndex;

View File

@@ -8,7 +8,7 @@ use common::{CountingWriter, OwnedBytes};
use super::optional_index::{open_optional_index, serialize_optional_index}; use super::optional_index::{open_optional_index, serialize_optional_index};
use super::{OptionalIndex, SerializableOptionalIndex, Set}; use super::{OptionalIndex, SerializableOptionalIndex, Set};
use crate::column_values::{ use crate::column_values::{
CodecType, ColumnValues, load_u64_based_column_values, serialize_u64_based_column_values, load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
}; };
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{DocId, RowId, Version}; use crate::{DocId, RowId, Version};
@@ -215,32 +215,6 @@ impl MultiValueIndex {
} }
} }
/// Returns an iterator over document ids that have at least one value.
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
let mut doc: DocId = 0u32;
let num_docs = idx.num_docs();
Box::new(std::iter::from_fn(move || {
// This is not the most efficient way to do this, but it's legacy code.
while doc < num_docs {
let cur = doc;
doc += 1;
let start = idx.start_index_column.get_val(cur);
let end = idx.start_index_column.get_val(cur + 1);
if end > start {
return Some(cur);
}
}
None
}))
}
MultiValueIndex::MultiValueIndexV2(idx) => {
Box::new(idx.optional_index.iter_non_null_docs())
}
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of /// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids. /// docids. Positions are converted inplace to docids.
/// ///

View File

@@ -1,4 +1,4 @@
use std::io; use std::io::{self, Write};
use std::sync::Arc; use std::sync::Arc;
mod set; mod set;
@@ -7,11 +7,11 @@ mod set_block;
use common::{BinarySerializable, OwnedBytes, VInt}; use common::{BinarySerializable, OwnedBytes, VInt};
pub use set::{SelectCursor, Set, SetCodec}; pub use set::{SelectCursor, Set, SetCodec};
use set_block::{ use set_block::{
DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
}; };
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{DocId, RowId}; use crate::{DocId, InvalidData, RowId};
/// The threshold for for number of elements after which we switch to dense block encoding. /// The threshold for for number of elements after which we switch to dense block encoding.
/// ///
@@ -80,23 +80,23 @@ impl BlockVariant {
/// index is the block index. For each block `byte_start` and `offset` is computed. /// index is the block index. For each block `byte_start` and `offset` is computed.
#[derive(Clone)] #[derive(Clone)]
pub struct OptionalIndex { pub struct OptionalIndex {
num_docs: RowId, num_rows: RowId,
num_non_null_docs: RowId, num_non_null_rows: RowId,
block_data: OwnedBytes, block_data: OwnedBytes,
block_metas: Arc<[BlockMeta]>, block_metas: Arc<[BlockMeta]>,
} }
impl Iterable<u32> for &OptionalIndex { impl<'a> Iterable<u32> for &'a OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_non_null_docs()) Box::new(self.iter_rows())
} }
} }
impl std::fmt::Debug for OptionalIndex { impl std::fmt::Debug for OptionalIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("OptionalIndex") f.debug_struct("OptionalIndex")
.field("num_docs", &self.num_docs) .field("num_rows", &self.num_rows)
.field("num_non_null_docs", &self.num_non_null_docs) .field("num_non_null_rows", &self.num_non_null_rows)
.finish_non_exhaustive() .finish_non_exhaustive()
} }
} }
@@ -123,7 +123,7 @@ enum BlockSelectCursor<'a> {
Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>), Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
} }
impl BlockSelectCursor<'_> { impl<'a> BlockSelectCursor<'a> {
fn select(&mut self, rank: u16) -> u16 { fn select(&mut self, rank: u16) -> u16 {
match self { match self {
BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank), BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
@@ -141,7 +141,7 @@ pub struct OptionalIndexSelectCursor<'a> {
num_null_rows_before_block: RowId, num_null_rows_before_block: RowId,
} }
impl OptionalIndexSelectCursor<'_> { impl<'a> OptionalIndexSelectCursor<'a> {
fn search_and_load_block(&mut self, rank: RowId) { fn search_and_load_block(&mut self, rank: RowId) {
if rank < self.current_block_end_rank { if rank < self.current_block_end_rank {
// we are already in the right block // we are already in the right block
@@ -165,7 +165,7 @@ impl OptionalIndexSelectCursor<'_> {
} }
} }
impl SelectCursor<RowId> for OptionalIndexSelectCursor<'_> { impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
fn select(&mut self, rank: RowId) -> RowId { fn select(&mut self, rank: RowId) -> RowId {
self.search_and_load_block(rank); self.search_and_load_block(rank);
let index_in_block = (rank - self.num_null_rows_before_block) as u16; let index_in_block = (rank - self.num_null_rows_before_block) as u16;
@@ -259,13 +259,11 @@ impl Set<RowId> for OptionalIndex {
impl OptionalIndex { impl OptionalIndex {
pub fn for_test(num_rows: RowId, row_ids: &[RowId]) -> OptionalIndex { pub fn for_test(num_rows: RowId, row_ids: &[RowId]) -> OptionalIndex {
assert!( assert!(row_ids
row_ids .last()
.last() .copied()
.copied() .map(|last_row_id| last_row_id < num_rows)
.map(|last_row_id| last_row_id < num_rows) .unwrap_or(true));
.unwrap_or(true)
);
let mut buffer = Vec::new(); let mut buffer = Vec::new();
serialize_optional_index(&row_ids, num_rows, &mut buffer).unwrap(); serialize_optional_index(&row_ids, num_rows, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer); let bytes = OwnedBytes::new(buffer);
@@ -273,18 +271,17 @@ impl OptionalIndex {
} }
pub fn num_docs(&self) -> RowId { pub fn num_docs(&self) -> RowId {
self.num_docs self.num_rows
} }
pub fn num_non_nulls(&self) -> RowId { pub fn num_non_nulls(&self) -> RowId {
self.num_non_null_docs self.num_non_null_rows
} }
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ { pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize. We could iterate over the blocks directly. // TODO optimize
// We use the dense value ids and retrieve the doc ids via select.
let mut select_batch = self.select_cursor(); let mut select_batch = self.select_cursor();
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank)) (0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
} }
pub fn select_batch(&self, ranks: &mut [RowId]) { pub fn select_batch(&self, ranks: &mut [RowId]) {
let mut select_cursor = self.select_cursor(); let mut select_cursor = self.select_cursor();
@@ -335,6 +332,38 @@ enum Block<'a> {
Sparse(SparseBlock<'a>), Sparse(SparseBlock<'a>),
} }
#[derive(Debug, Copy, Clone)]
enum OptionalIndexCodec {
Dense = 0,
Sparse = 1,
}
impl OptionalIndexCodec {
fn to_code(self) -> u8 {
self as u8
}
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0 => Ok(Self::Dense),
1 => Ok(Self::Sparse),
_ => Err(InvalidData),
}
}
}
impl BinarySerializable for OptionalIndexCodec {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&[self.to_code()])
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let optional_codec_code = u8::deserialize(reader)?;
let optional_codec = Self::try_from_code(optional_codec_code)?;
Ok(optional_codec)
}
}
fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> { fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
let is_sparse = is_sparse(block_els.len() as u32); let is_sparse = is_sparse(block_els.len() as u32);
if is_sparse { if is_sparse {
@@ -476,7 +505,7 @@ fn deserialize_optional_index_block_metadatas(
non_null_rows_before_block += num_non_null_rows; non_null_rows_before_block += num_non_null_rows;
} }
block_metas.resize( block_metas.resize(
num_rows.div_ceil(ELEMENTS_PER_BLOCK) as usize, ((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize,
BlockMeta { BlockMeta {
non_null_rows_before_block, non_null_rows_before_block,
start_byte_offset, start_byte_offset,
@@ -490,15 +519,15 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
let (mut bytes, num_non_empty_blocks_bytes) = bytes.rsplit(2); let (mut bytes, num_non_empty_blocks_bytes) = bytes.rsplit(2);
let num_non_empty_block_bytes = let num_non_empty_block_bytes =
u16::from_le_bytes(num_non_empty_blocks_bytes.as_slice().try_into().unwrap()); u16::from_le_bytes(num_non_empty_blocks_bytes.as_slice().try_into().unwrap());
let num_docs = VInt::deserialize_u64(&mut bytes)? as u32; let num_rows = VInt::deserialize_u64(&mut bytes)? as u32;
let block_metas_num_bytes = let block_metas_num_bytes =
num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES; num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes); let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
let (block_metas, num_non_null_docs) = let (block_metas, num_non_null_rows) =
deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_docs); deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
let optional_index = OptionalIndex { let optional_index = OptionalIndex {
num_docs, num_rows,
num_non_null_docs, num_non_null_rows,
block_data, block_data,
block_metas: block_metas.into(), block_metas: block_metas.into(),
}; };

View File

@@ -2,7 +2,7 @@ use std::io::{self, Write};
use common::BinarySerializable; use common::BinarySerializable;
use crate::column_index::optional_index::{ELEMENTS_PER_BLOCK, SelectCursor, Set, SetCodec}; use crate::column_index::optional_index::{SelectCursor, Set, SetCodec, ELEMENTS_PER_BLOCK};
#[inline(always)] #[inline(always)]
fn get_bit_at(input: u64, n: u16) -> bool { fn get_bit_at(input: u64, n: u16) -> bool {
@@ -23,6 +23,7 @@ fn set_bit_at(input: &mut u64, n: u16) {
/// ///
/// When translating a dense index to the original index, we can use the offset to find the correct /// When translating a dense index to the original index, we can use the offset to find the correct
/// block. Direct computation is not possible, but we can employ a linear or binary search. /// block. Direct computation is not possible, but we can employ a linear or binary search.
const ELEMENTS_PER_MINI_BLOCK: u16 = 64; const ELEMENTS_PER_MINI_BLOCK: u16 = 64;
const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8; const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8;
const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2; const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2;
@@ -108,7 +109,7 @@ pub struct DenseBlockSelectCursor<'a> {
dense_block: DenseBlock<'a>, dense_block: DenseBlock<'a>,
} }
impl SelectCursor<u16> for DenseBlockSelectCursor<'_> { impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
#[inline] #[inline]
fn select(&mut self, rank: u16) -> u16 { fn select(&mut self, rank: u16) -> u16 {
self.block_id = self self.block_id = self
@@ -174,7 +175,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
} }
} }
impl DenseBlock<'_> { impl<'a> DenseBlock<'a> {
#[inline] #[inline]
fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock { fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES; let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES;

View File

@@ -1,7 +1,7 @@
mod dense; mod dense;
mod sparse; mod sparse;
pub use dense::{DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec}; pub use dense::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
pub use sparse::{SparseBlock, SparseBlockCodec}; pub use sparse::{SparseBlock, SparseBlockCodec};
#[cfg(test)] #[cfg(test)]

View File

@@ -31,7 +31,7 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
} }
} }
impl Set<u16> for SparseBlock<'_> { impl<'a> Set<u16> for SparseBlock<'a> {
type SelectCursor<'b> type SelectCursor<'b>
= Self = Self
where Self: 'b; where Self: 'b;
@@ -69,7 +69,7 @@ fn get_u16(data: &[u8], byte_position: usize) -> u16 {
u16::from_le_bytes(bytes) u16::from_le_bytes(bytes)
} }
impl SparseBlock<'_> { impl<'a> SparseBlock<'a> {
#[inline(always)] #[inline(always)]
fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 { fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 {
let start_offset: usize = idx as usize * 2; let start_offset: usize = idx as usize * 2;
@@ -82,7 +82,7 @@ impl SparseBlock<'_> {
} }
#[inline] #[inline]
#[expect(clippy::comparison_chain)] #[allow(clippy::comparison_chain)]
// Looks for the element in the block. Returns the positions if found. // Looks for the element in the block. Returns the positions if found.
fn binary_search(&self, target: u16) -> Result<u16, u16> { fn binary_search(&self, target: u16) -> Result<u16, u16> {
let data = &self.0; let data = &self.0;

View File

@@ -164,11 +164,7 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) { fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids); let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_docs(), num_rows); assert_eq!(optional_index.num_docs(), num_rows);
assert!( assert!(optional_index.iter_rows().eq(row_ids.iter().copied()));
optional_index
.iter_non_null_docs()
.eq(row_ids.iter().copied())
);
} }
#[test] #[test]
@@ -223,3 +219,174 @@ fn test_optional_index_for_tests() {
assert!(!optional_index.contains(3)); assert!(!optional_index.contains(3));
assert_eq!(optional_index.num_docs(), 4); assert_eq!(optional_index.num_docs(), 4);
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::*;
const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_bools(fill_ratio: f64) -> OptionalIndex {
let mut out = Vec::new();
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.map(|(pos, _)| pos as RowId)
.collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap()
}
fn random_range_iterator(
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
if current >= end {
None
} else {
Some(current)
}
})
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
)
}
fn walk_over_data_from_positions(
codec: &OptionalIndex,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.rank_if_exists(idx));
}
dense_idx
}
#[bench]
fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 1000));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.5f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 0.005f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_10percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.1f64, 0.005f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 10f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 100f32, bench);
}
fn bench_translate_codec_to_orig_util(
percent_filled: f64,
percent_hit: f32,
bench: &mut Bencher,
) {
let codec = gen_bools(percent_filled);
let num_non_nulls = codec.num_non_nulls();
let idxs: Vec<u32> = if percent_hit == 100.0f32 {
(0..num_non_nulls).collect()
} else {
n_percent_step_iterator(percent_hit, num_non_nulls).collect()
};
let mut output = vec![0u32; idxs.len()];
bench.iter(|| {
output.copy_from_slice(&idxs[..]);
codec.select_batch(&mut output);
});
}
#[bench]
fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.9f64, 0.005, bench);
}
#[bench]
fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.9f64, 100.0f32, bench);
}
}

View File

@@ -3,11 +3,11 @@ use std::io::Write;
use common::{CountingWriter, OwnedBytes}; use common::{CountingWriter, OwnedBytes};
use super::OptionalIndex;
use super::multivalued_index::SerializableMultivalueIndex; use super::multivalued_index::SerializableMultivalueIndex;
use crate::column_index::ColumnIndex; use super::OptionalIndex;
use crate::column_index::multivalued_index::serialize_multivalued_index; use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index; use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, RowId, Version}; use crate::{Cardinality, RowId, Version};
@@ -31,7 +31,7 @@ pub enum SerializableColumnIndex<'a> {
Multivalued(SerializableMultivalueIndex<'a>), Multivalued(SerializableMultivalueIndex<'a>),
} }
impl SerializableColumnIndex<'_> { impl<'a> SerializableColumnIndex<'a> {
pub fn get_cardinality(&self) -> Cardinality { pub fn get_cardinality(&self) -> Cardinality {
match self { match self {
SerializableColumnIndex::Full => Cardinality::Full, SerializableColumnIndex::Full => Cardinality::Full,

View File

@@ -0,0 +1,139 @@
use std::sync::Arc;
use common::OwnedBytes;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::{self, Bencher};
use super::*;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55000_u64)
.map(|num| num + rng.gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
}
stats_collector.stats()
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();
for val in data {
codec_serializer.collect(*val);
}
codec_serializer
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
.unwrap();
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
#[inline(never)]
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = Arc::new(get_reader_for_bench::<Codec>(data));
bench_get_dynamic_helper(b, col);
}
fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let stats = compute_stats(data.iter().cloned());
let mut bytes = Vec::new();
b.iter(|| {
bytes.clear();
let mut codec_serializer = Codec::estimator();
for val in data.iter().take(1024) {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
});
}
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
}

View File

@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) merge_row_order: &'a MergeRowOrder, pub(crate) merge_row_order: &'a MergeRowOrder,
} }
impl<T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'_, T> { impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order { match self.merge_row_order {
MergeRowOrder::Stack(_) => Box::new( MergeRowOrder::Stack(_) => Box::new(

View File

@@ -26,13 +26,13 @@ mod monotonic_column;
pub(crate) use merge::MergedColumnValues; pub(crate) use merge::MergedColumnValues;
pub use stats::ColumnStats; pub use stats::ColumnStats;
pub use u64_based::{
ALL_U64_CODEC_TYPES, CodecType, load_u64_based_column_values,
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
};
pub use u128_based::{ pub use u128_based::{
CompactSpaceU64Accessor, open_u128_as_compact_u64, open_u128_mapped, open_u128_as_compact_u64, open_u128_mapped, serialize_column_values_u128,
serialize_column_values_u128, CompactSpaceU64Accessor,
};
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
}; };
pub use vec_column::VecColumn; pub use vec_column::VecColumn;
@@ -242,3 +242,6 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
.get_row_ids_for_value_range(range, doc_id_range, positions) .get_row_ids_for_value_range(range, doc_id_range, positions)
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench;

View File

@@ -2,8 +2,8 @@ use std::fmt::Debug;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use crate::ColumnValues;
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn; use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
use crate::ColumnValues;
struct MonotonicMappingColumn<C, T, Input> { struct MonotonicMappingColumn<C, T, Input> {
from_column: C, from_column: C,
@@ -99,10 +99,10 @@ where
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::column_values::VecColumn;
use crate::column_values::monotonic_mapping::{ use crate::column_values::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal, StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
}; };
use crate::column_values::VecColumn;
#[test] #[test]
fn test_monotonic_mapping_iter() { fn test_monotonic_mapping_iter() {

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
/// Monotonic maps a value to u128 value space /// Montonic maps a value to u128 value space
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space. /// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync { pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
/// Converts a value to u128. /// Converts a value to u128.

View File

@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
let mut covered_space = Vec::with_capacity(self.blanks.len()); let mut covered_space = Vec::with_capacity(self.blanks.len());
// beginning of the blanks // beginning of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
&& *first_blank_start != 0 if *first_blank_start != 0 {
{ covered_space.push(0..=first_blank_start - 1);
covered_space.push(0..=first_blank_start - 1); }
} }
// Between the blanks // Between the blanks
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
covered_space.extend(between_blanks); covered_space.extend(between_blanks);
// end of the blanks // end of the blanks
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
&& *last_blank_end != u128::MAX if *last_blank_end != u128::MAX {
{ covered_space.push(last_blank_end + 1..=u128::MAX);
covered_space.push(last_blank_end + 1..=u128::MAX); }
} }
if covered_space.is_empty() { if covered_space.is_empty() {

View File

@@ -24,8 +24,8 @@ use build_compact_space::get_compact_space;
use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128}; use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{BitPacker, BitUnpacker}; use tantivy_bitpacker::{BitPacker, BitUnpacker};
use crate::RowId;
use crate::column_values::ColumnValues; use crate::column_values::ColumnValues;
use crate::RowId;
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
/// blanks depends on the number of blanks. /// blanks depends on the number of blanks.
@@ -653,14 +653,12 @@ mod tests {
), ),
&[3] &[3]
); );
assert!( assert!(get_positions_for_value_range_helper(
get_positions_for_value_range_helper( &decomp,
&decomp, 99998u128..=99998u128,
99998u128..=99998u128, complete_range.clone()
complete_range.clone() )
) .is_empty());
.is_empty()
);
assert_eq!( assert_eq!(
&get_positions_for_value_range_helper( &get_positions_for_value_range_helper(
&decomp, &decomp,

View File

@@ -128,13 +128,13 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
} }
#[cfg(test)] #[cfg(test)]
pub(crate) mod tests { pub mod tests {
use super::*; use super::*;
use crate::column_values::CodecType;
use crate::column_values::u64_based::{ use crate::column_values::u64_based::{
ALL_U64_CODEC_TYPES, serialize_and_load_u64_based_column_values, serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_based_column_values, ALL_U64_CODEC_TYPES,
}; };
use crate::column_values::CodecType;
#[test] #[test]
fn test_serialize_deserialize_u128_header() { fn test_serialize_deserialize_u128_header() {

View File

@@ -4,7 +4,7 @@ use std::ops::{Range, RangeInclusive};
use common::{BinarySerializable, OwnedBytes}; use common::{BinarySerializable, OwnedBytes};
use fastdivide::DividerU64; use fastdivide::DividerU64;
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats}; use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::{ColumnValues, RowId}; use crate::{ColumnValues, RowId};
@@ -23,7 +23,11 @@ const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
// copied from unstable rust standard library. // copied from unstable rust standard library.
let d = n / q.get(); let d = n / q.get();
let r = n % q.get(); let r = n % q.get();
if r > 0 { d + 1 } else { d } if r > 0 {
d + 1
} else {
d
}
} }
// The bitpacked codec applies a linear transformation `f` over data that are bitpacked. // The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
@@ -41,6 +45,12 @@ fn transform_range_before_linear_transformation(
if range.is_empty() { if range.is_empty() {
return None; return None;
} }
if stats.min_value > *range.end() {
return None;
}
if stats.max_value < *range.start() {
return None;
}
let shifted_range = let shifted_range =
range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value); range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd); let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
@@ -99,7 +109,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn estimate(&self, stats: &ColumnStats) -> Option<u64> { fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats); let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8)) Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
} }
fn serialize( fn serialize(

View File

@@ -4,12 +4,12 @@ use std::{io, iter};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes}; use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes};
use fastdivide::DividerU64; use fastdivide::DividerU64;
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::MonotonicallyMappableToU64;
use crate::column_values::u64_based::line::Line; use crate::column_values::u64_based::line::Line;
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats}; use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::{ColumnValues, VecColumn}; use crate::column_values::{ColumnValues, VecColumn};
use crate::MonotonicallyMappableToU64;
const BLOCK_SIZE: u32 = 512u32; const BLOCK_SIZE: u32 = 512u32;
@@ -39,7 +39,7 @@ impl BinarySerializable for Block {
} }
fn compute_num_blocks(num_vals: u32) -> u32 { fn compute_num_blocks(num_vals: u32) -> u32 {
num_vals.div_ceil(BLOCK_SIZE) (num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE
} }
pub struct BlockwiseLinearEstimator { pub struct BlockwiseLinearEstimator {

View File

@@ -8,7 +8,7 @@ use crate::column_values::ColumnValues;
const MID_POINT: u64 = (1u64 << 32) - 1u64; const MID_POINT: u64 = (1u64 << 32) - 1u64;
/// `Line` describes a line function `y: ax + b` using integer /// `Line` describes a line function `y: ax + b` using integer
/// arithmetic. /// arithmetics.
/// ///
/// The slope is in fact a decimal split into a 32 bit integer value, /// The slope is in fact a decimal split into a 32 bit integer value,
/// and a 32-bit decimal value. /// and a 32-bit decimal value.
@@ -94,7 +94,7 @@ impl Line {
// `(i, ys[])`. // `(i, ys[])`.
// //
// The best intercept therefore has the form // The best intercept therefore has the form
// `y[i] - line.eval(i)` (using wrapping arithmetic). // `y[i] - line.eval(i)` (using wrapping arithmetics).
// In other words, the best intercept is one of the `y - Line::eval(ys[i])` // In other words, the best intercept is one of the `y - Line::eval(ys[i])`
// and our task is just to pick the one that minimizes our error. // and our task is just to pick the one that minimizes our error.
// //

View File

@@ -1,13 +1,13 @@
use std::io; use std::io;
use common::{BinarySerializable, OwnedBytes}; use common::{BinarySerializable, OwnedBytes};
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use super::ColumnValues;
use super::line::Line; use super::line::Line;
use crate::RowId; use super::ColumnValues;
use crate::column_values::VecColumn;
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats}; use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::VecColumn;
use crate::RowId;
const HALF_SPACE: u64 = u64::MAX / 2; const HALF_SPACE: u64 = u64::MAX / 2;
const LINE_ESTIMATION_BLOCK_LEN: usize = 512; const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
Some( Some(
stats.num_bytes() stats.num_bytes()
+ linear_params.num_bytes() + linear_params.num_bytes()
+ (num_bits as u64 * stats.num_rows as u64).div_ceil(8), + (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
) )
} }

View File

@@ -17,7 +17,7 @@ pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec; pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
pub use crate::column_values::u64_based::linear::LinearCodec; pub use crate::column_values::u64_based::linear::LinearCodec;
pub use crate::column_values::u64_based::stats_collector::StatsCollector; pub use crate::column_values::u64_based::stats_collector::StatsCollector;
use crate::column_values::{ColumnStats, monotonic_map_column}; use crate::column_values::{monotonic_map_column, ColumnStats};
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{ColumnValues, MonotonicallyMappableToU64}; use crate::{ColumnValues, MonotonicallyMappableToU64};
@@ -52,7 +52,7 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
) -> io::Result<()>; ) -> io::Result<()>;
} }
/// A column codec describes a column serialization format. /// A column codec describes a colunm serialization format.
pub trait ColumnCodec<T: PartialOrd = u64> { pub trait ColumnCodec<T: PartialOrd = u64> {
/// Specialized `ColumnValues` type. /// Specialized `ColumnValues` type.
type ColumnValues: ColumnValues<T> + 'static; type ColumnValues: ColumnValues<T> + 'static;

View File

@@ -2,8 +2,8 @@ use std::num::NonZeroU64;
use fastdivide::DividerU64; use fastdivide::DividerU64;
use crate::RowId;
use crate::column_values::ColumnStats; use crate::column_values::ColumnStats;
use crate::RowId;
/// Compute the gcd of two non null numbers. /// Compute the gcd of two non null numbers.
/// ///
@@ -96,8 +96,8 @@ impl StatsCollector {
mod tests { mod tests {
use std::num::NonZeroU64; use std::num::NonZeroU64;
use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
use crate::column_values::u64_based::ColumnStats; use crate::column_values::u64_based::ColumnStats;
use crate::column_values::u64_based::stats_collector::{StatsCollector, compute_gcd};
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats { fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default(); let mut stats_collector = StatsCollector::default();

View File

@@ -1,6 +1,5 @@
use proptest::prelude::*; use proptest::prelude::*;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
use rand::Rng;
#[test] #[test]
fn test_serialize_and_load_simple() { fn test_serialize_and_load_simple() {

View File

@@ -4,8 +4,8 @@ use std::net::Ipv6Addr;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::InvalidData;
use crate::value::NumericalType; use crate::value::NumericalType;
use crate::InvalidData;
/// The column type represents the column type. /// The column type represents the column type.
/// Any changes need to be propagated to `COLUMN_TYPES`. /// Any changes need to be propagated to `COLUMN_TYPES`.

View File

@@ -3,7 +3,7 @@ use std::io::{self, Write};
use common::{BitSet, CountingWriter, ReadOnlyBitSet}; use common::{BitSet, CountingWriter, ReadOnlyBitSet};
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable}; use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
use super::term_merger::{TermMerger, TermsWithSegmentOrd}; use super::term_merger::TermMerger;
use crate::column::serialize_column_mappable_to_u64; use crate::column::serialize_column_mappable_to_u64;
use crate::column_index::SerializableColumnIndex; use crate::column_index::SerializableColumnIndex;
use crate::iterable::Iterable; use crate::iterable::Iterable;
@@ -39,7 +39,7 @@ struct RemappedTermOrdinalsValues<'a> {
merge_row_order: &'a MergeRowOrder, merge_row_order: &'a MergeRowOrder,
} }
impl Iterable for RemappedTermOrdinalsValues<'_> { impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
match self.merge_row_order { match self.merge_row_order {
MergeRowOrder::Stack(_) => self.boxed_iter_stacked(), MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
@@ -50,7 +50,7 @@ impl Iterable for RemappedTermOrdinalsValues<'_> {
} }
} }
impl RemappedTermOrdinalsValues<'_> { impl<'a> RemappedTermOrdinalsValues<'a> {
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> { fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
let iter = self let iter = self
.bytes_columns .bytes_columns
@@ -126,17 +126,14 @@ fn serialize_merged_dict(
let mut term_ord_mapping = TermOrdinalMapping::default(); let mut term_ord_mapping = TermOrdinalMapping::default();
let mut field_term_streams = Vec::new(); let mut field_term_streams = Vec::new();
for (segment_ord, column_opt) in bytes_columns.iter().enumerate() { for column_opt in bytes_columns.iter() {
if let Some(column) = column_opt { if let Some(column) = column_opt {
term_ord_mapping.add_segment(column.dictionary.num_terms()); term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms: Streamer<VoidSSTable> = column.dictionary.stream()?; let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
field_term_streams.push(TermsWithSegmentOrd { terms, segment_ord }); field_term_streams.push(terms);
} else { } else {
term_ord_mapping.add_segment(0); term_ord_mapping.add_segment(0);
field_term_streams.push(TermsWithSegmentOrd { field_term_streams.push(Streamer::empty());
terms: Streamer::empty(),
segment_ord,
});
} }
} }
@@ -194,7 +191,6 @@ fn serialize_merged_dict(
#[derive(Default, Debug)] #[derive(Default, Debug)]
struct TermOrdinalMapping { struct TermOrdinalMapping {
/// Contains the new term ordinals for each segment.
per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>, per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>,
} }
@@ -209,6 +205,6 @@ impl TermOrdinalMapping {
} }
fn get_segment(&self, segment_ord: u32) -> &[TermOrdinal] { fn get_segment(&self, segment_ord: u32) -> &[TermOrdinal] {
&self.per_segment_new_term_ordinals[segment_ord as usize] &(self.per_segment_new_term_ordinals[segment_ord as usize])[..]
} }
} }

View File

@@ -26,7 +26,7 @@ impl StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len()); let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
let mut cumulated_row_id = 0; let mut cumulated_row_id = 0;
for columnar in columnars { for columnar in columnars {
cumulated_row_id += columnar.num_docs(); cumulated_row_id += columnar.num_rows();
cumulated_row_ids.push(cumulated_row_id); cumulated_row_ids.push(cumulated_row_id);
} }
StackMergeOrder { cumulated_row_ids } StackMergeOrder { cumulated_row_ids }

View File

@@ -10,11 +10,11 @@ use std::sync::Arc;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder}; pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer; use super::writer::ColumnarSerializer;
use crate::column::{serialize_column_mappable_to_u64, serialize_column_mappable_to_u128}; use crate::column::{serialize_column_mappable_to_u128, serialize_column_mappable_to_u64};
use crate::column_values::MergedColumnValues; use crate::column_values::MergedColumnValues;
use crate::columnar::ColumnarReader;
use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column; use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column;
use crate::columnar::writer::CompatibleNumericalTypes; use crate::columnar::writer::CompatibleNumericalTypes;
use crate::columnar::ColumnarReader;
use crate::dynamic_column::DynamicColumn; use crate::dynamic_column::DynamicColumn;
use crate::{ use crate::{
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType, BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
@@ -80,12 +80,13 @@ pub fn merge_columnar(
output: &mut impl io::Write, output: &mut impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(output); let mut serializer = ColumnarSerializer::new(output);
let num_docs_per_columnar = columnar_readers let num_rows_per_columnar = columnar_readers
.iter() .iter()
.map(|reader| reader.num_docs()) .map(|reader| reader.num_rows())
.collect::<Vec<u32>>(); .collect::<Vec<u32>>();
let columns_to_merge = group_columns_for_merge(columnar_readers, required_columns)?; let columns_to_merge =
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
for res in columns_to_merge { for res in columns_to_merge {
let ((column_name, _column_type_category), grouped_columns) = res; let ((column_name, _column_type_category), grouped_columns) = res;
let grouped_columns = grouped_columns.open(&merge_row_order)?; let grouped_columns = grouped_columns.open(&merge_row_order)?;
@@ -93,18 +94,15 @@ pub fn merge_columnar(
continue; continue;
} }
let column_type_after_merge = grouped_columns.column_type_after_merge(); let column_type = grouped_columns.column_type_after_merge();
let mut columns = grouped_columns.columns; let mut columns = grouped_columns.columns;
// Make sure the number of columns is the same as the number of columnar readers. coerce_columns(column_type, &mut columns)?;
// Or num_docs_per_columnar would be incorrect.
assert_eq!(columns.len(), columnar_readers.len());
coerce_columns(column_type_after_merge, &mut columns)?;
let mut column_serializer = let mut column_serializer =
serializer.start_serialize_column(column_name.as_bytes(), column_type_after_merge); serializer.start_serialize_column(column_name.as_bytes(), column_type);
merge_column( merge_column(
column_type_after_merge, column_type,
&num_docs_per_columnar, &num_rows_per_columnar,
columns, columns,
&merge_row_order, &merge_row_order,
&mut column_serializer, &mut column_serializer,
@@ -130,7 +128,7 @@ fn dynamic_column_to_u64_monotonic(dynamic_column: DynamicColumn) -> Option<Colu
fn merge_column( fn merge_column(
column_type: ColumnType, column_type: ColumnType,
num_docs_per_column: &[u32], num_docs_per_column: &[u32],
columns_to_merge: Vec<Option<DynamicColumn>>, columns: Vec<Option<DynamicColumn>>,
merge_row_order: &MergeRowOrder, merge_row_order: &MergeRowOrder,
wrt: &mut impl io::Write, wrt: &mut impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
@@ -140,21 +138,20 @@ fn merge_column(
| ColumnType::F64 | ColumnType::F64
| ColumnType::DateTime | ColumnType::DateTime
| ColumnType::Bool => { | ColumnType::Bool => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len()); let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> = let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
Vec::with_capacity(columns_to_merge.len()); Vec::with_capacity(columns.len());
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() { for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
match dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic) { if let Some(Column { index: idx, values }) =
Some(Column { index: idx, values }) => { dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
column_indexes.push(idx); {
column_values.push(Some(values)); column_indexes.push(idx);
} column_values.push(Some(values));
None => { } else {
column_indexes.push(ColumnIndex::Empty { column_indexes.push(ColumnIndex::Empty {
num_docs: num_docs_per_column[i], num_docs: num_docs_per_column[i],
}); });
column_values.push(None); column_values.push(None);
}
} }
} }
let merged_column_index = let merged_column_index =
@@ -167,10 +164,10 @@ fn merge_column(
serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?; serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?;
} }
ColumnType::IpAddr => { ColumnType::IpAddr => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len()); let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> = let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> =
Vec::with_capacity(columns_to_merge.len()); Vec::with_capacity(columns.len());
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() { for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
if let Some(DynamicColumn::IpAddr(Column { index: idx, values })) = if let Some(DynamicColumn::IpAddr(Column { index: idx, values })) =
dynamic_column_opt dynamic_column_opt
{ {
@@ -195,10 +192,9 @@ fn merge_column(
serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?; serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?;
} }
ColumnType::Bytes | ColumnType::Str => { ColumnType::Bytes | ColumnType::Str => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len()); let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut bytes_columns: Vec<Option<BytesColumn>> = let mut bytes_columns: Vec<Option<BytesColumn>> = Vec::with_capacity(columns.len());
Vec::with_capacity(columns_to_merge.len()); for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
match dynamic_column_opt { match dynamic_column_opt {
Some(DynamicColumn::Str(str_column)) => { Some(DynamicColumn::Str(str_column)) => {
column_indexes.push(str_column.term_ord_column.index.clone()); column_indexes.push(str_column.term_ord_column.index.clone());
@@ -252,15 +248,13 @@ impl GroupedColumns {
if column_type.len() == 1 { if column_type.len() == 1 {
return column_type.into_iter().next().unwrap(); return column_type.into_iter().next().unwrap();
} }
// At the moment, only the numerical column type category has more than one possible // At the moment, only the numerical categorical column type has more than one possible
// column type. // column type.
assert!( assert!(self
self.columns .columns
.iter() .iter()
.flatten() .flatten()
.all(|el| ColumnTypeCategory::from(el.column_type()) .all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
== ColumnTypeCategory::Numerical)
);
merged_numerical_columns_type(self.columns.iter().flatten()).into() merged_numerical_columns_type(self.columns.iter().flatten()).into()
} }
} }
@@ -367,7 +361,7 @@ fn is_empty_after_merge(
ColumnIndex::Empty { .. } => true, ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0, ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => { ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() { for doc in optional_index.iter_rows() {
if alive_bitset.contains(doc) { if alive_bitset.contains(doc) {
return false; return false;
} }
@@ -397,6 +391,7 @@ fn is_empty_after_merge(
fn group_columns_for_merge<'a>( fn group_columns_for_merge<'a>(
columnar_readers: &'a [&'a ColumnarReader], columnar_readers: &'a [&'a ColumnarReader],
required_columns: &'a [(String, ColumnType)], required_columns: &'a [(String, ColumnType)],
_merge_row_order: &'a MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> { ) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new(); let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();

View File

@@ -5,29 +5,28 @@ use sstable::TermOrdinal;
use crate::Streamer; use crate::Streamer;
/// The terms of a column with the ordinal of the segment. pub struct HeapItem<'a> {
pub struct TermsWithSegmentOrd<'a> { pub streamer: Streamer<'a>,
pub terms: Streamer<'a>,
pub segment_ord: usize, pub segment_ord: usize,
} }
impl PartialEq for TermsWithSegmentOrd<'_> { impl<'a> PartialEq for HeapItem<'a> {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.segment_ord == other.segment_ord self.segment_ord == other.segment_ord
} }
} }
impl Eq for TermsWithSegmentOrd<'_> {} impl<'a> Eq for HeapItem<'a> {}
impl<'a> PartialOrd for TermsWithSegmentOrd<'a> { impl<'a> PartialOrd for HeapItem<'a> {
fn partial_cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Option<Ordering> { fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl<'a> Ord for TermsWithSegmentOrd<'a> { impl<'a> Ord for HeapItem<'a> {
fn cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Ordering { fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
(&other.terms.key(), &other.segment_ord).cmp(&(&self.terms.key(), &self.segment_ord)) (&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
} }
} }
@@ -38,32 +37,39 @@ impl<'a> Ord for TermsWithSegmentOrd<'a> {
/// - the term /// - the term
/// - a slice with the ordinal of the segments containing the terms. /// - a slice with the ordinal of the segments containing the terms.
pub struct TermMerger<'a> { pub struct TermMerger<'a> {
heap: BinaryHeap<TermsWithSegmentOrd<'a>>, heap: BinaryHeap<HeapItem<'a>>,
term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>, current_streamers: Vec<HeapItem<'a>>,
} }
impl<'a> TermMerger<'a> { impl<'a> TermMerger<'a> {
/// Stream of merged term dictionary /// Stream of merged term dictionary
pub fn new(term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>) -> TermMerger<'a> { pub fn new(streams: Vec<Streamer<'a>>) -> TermMerger<'a> {
TermMerger { TermMerger {
heap: BinaryHeap::new(), heap: BinaryHeap::new(),
term_streams_with_segment, current_streamers: streams
.into_iter()
.enumerate()
.map(|(ord, streamer)| HeapItem {
streamer,
segment_ord: ord,
})
.collect(),
} }
} }
pub(crate) fn matching_segments<'b: 'a>( pub(crate) fn matching_segments<'b: 'a>(
&'b self, &'b self,
) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> { ) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
self.term_streams_with_segment self.current_streamers
.iter() .iter()
.map(|heap_item| (heap_item.segment_ord, heap_item.terms.term_ord())) .map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
} }
fn advance_segments(&mut self) { fn advance_segments(&mut self) {
let streamers = &mut self.term_streams_with_segment; let streamers = &mut self.current_streamers;
let heap = &mut self.heap; let heap = &mut self.heap;
for mut heap_item in streamers.drain(..) { for mut heap_item in streamers.drain(..) {
if heap_item.terms.advance() { if heap_item.streamer.advance() {
heap.push(heap_item); heap.push(heap_item);
} }
} }
@@ -74,19 +80,18 @@ impl<'a> TermMerger<'a> {
/// False if there is none. /// False if there is none.
pub fn advance(&mut self) -> bool { pub fn advance(&mut self) -> bool {
self.advance_segments(); self.advance_segments();
match self.heap.pop() { if let Some(head) = self.heap.pop() {
Some(head) => { self.current_streamers.push(head);
self.term_streams_with_segment.push(head); while let Some(next_streamer) = self.heap.peek() {
while let Some(next_streamer) = self.heap.peek() { if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() { break;
break;
}
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.term_streams_with_segment.push(next_heap_it);
} }
true let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.current_streamers.push(next_heap_it);
} }
_ => false, true
} else {
false
} }
} }
@@ -96,6 +101,6 @@ impl<'a> TermMerger<'a> {
/// if and only if advance() has been called before /// if and only if advance() has been called before
/// and "true" was returned. /// and "true" was returned.
pub fn key(&self) -> &[u8] { pub fn key(&self) -> &[u8] {
self.term_streams_with_segment[0].terms.key() self.current_streamers[0].streamer.key()
} }
} }

View File

@@ -1,10 +1,7 @@
use itertools::Itertools; use itertools::Itertools;
use proptest::collection::vec;
use proptest::prelude::*;
use super::*; use super::*;
use crate::columnar::{ColumnarReader, MergeRowOrder, StackMergeOrder, merge_columnar}; use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
use crate::{Cardinality, ColumnarWriter, DynamicColumn, HasAssociatedColumnType, RowId};
fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>( fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
column_name: &str, column_name: &str,
@@ -29,8 +26,9 @@ fn test_column_coercion_to_u64() {
// u64 type // u64 type
let columnar2 = make_columnar("numbers", &[u64::MAX]); let columnar2 = make_columnar("numbers", &[u64::MAX]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[]).unwrap(); group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1); assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical))); assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
} }
@@ -40,8 +38,9 @@ fn test_column_coercion_to_i64() {
let columnar1 = make_columnar("numbers", &[-1i64]); let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers", &[2u64]); let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[]).unwrap(); group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1); assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical))); assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
} }
@@ -64,8 +63,14 @@ fn test_group_columns_with_required_column() {
let columnar1 = make_columnar("numbers", &[1i64]); let columnar1 = make_columnar("numbers", &[1i64]);
let columnar2 = make_columnar("numbers", &[2u64]); let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[("numbers".to_string(), ColumnType::U64)]).unwrap(); group_columns_for_merge(
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1); assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical))); assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
} }
@@ -75,9 +80,13 @@ fn test_group_columns_required_column_with_no_existing_columns() {
let columnar1 = make_columnar("numbers", &[2u64]); let columnar1 = make_columnar("numbers", &[2u64]);
let columnar2 = make_columnar("numbers", &[2u64]); let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let column_map: BTreeMap<_, _> = let merge_order = StackMergeOrder::stack(columnars).into();
group_columns_for_merge(columnars, &[("required_col".to_string(), ColumnType::Str)]) let column_map: BTreeMap<_, _> = group_columns_for_merge(
.unwrap(); columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 2); assert_eq!(column_map.len(), 2);
let columns = &column_map let columns = &column_map
.get(&("required_col".to_string(), ColumnTypeCategory::Str)) .get(&("required_col".to_string(), ColumnTypeCategory::Str))
@@ -93,8 +102,14 @@ fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_ru
let columnar1 = make_columnar("numbers", &[2i64]); let columnar1 = make_columnar("numbers", &[2i64]);
let columnar2 = make_columnar("numbers", &[2i64]); let columnar2 = make_columnar("numbers", &[2i64]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[("numbers".to_string(), ColumnType::U64)]).unwrap(); group_columns_for_merge(
columnars,
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1); assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical))); assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
} }
@@ -104,8 +119,9 @@ fn test_missing_column() {
let columnar1 = make_columnar("numbers", &[-1i64]); let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers2", &[2u64]); let columnar2 = make_columnar("numbers2", &[2u64]);
let columnars = &[&columnar1, &columnar2]; let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[]).unwrap(); group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 2); assert_eq!(column_map.len(), 2);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical))); assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
{ {
@@ -208,7 +224,7 @@ fn test_merge_columnar_numbers() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 3); assert_eq!(columnar_reader.num_rows(), 3);
assert_eq!(columnar_reader.num_columns(), 1); assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap(); let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap(); let dynamic_column = cols[0].open().unwrap();
@@ -236,7 +252,7 @@ fn test_merge_columnar_texts() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 3); assert_eq!(columnar_reader.num_rows(), 3);
assert_eq!(columnar_reader.num_columns(), 1); assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap(); let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap(); let dynamic_column = cols[0].open().unwrap();
@@ -285,7 +301,7 @@ fn test_merge_columnar_byte() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 4); assert_eq!(columnar_reader.num_rows(), 4);
assert_eq!(columnar_reader.num_columns(), 1); assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap(); let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap(); let dynamic_column = cols[0].open().unwrap();
@@ -341,7 +357,7 @@ fn test_merge_columnar_byte_with_missing() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 3 + 2 + 3); assert_eq!(columnar_reader.num_rows(), 3 + 2 + 3);
assert_eq!(columnar_reader.num_columns(), 2); assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap(); let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap(); let dynamic_column = cols[0].open().unwrap();
@@ -393,7 +409,7 @@ fn test_merge_columnar_different_types() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 4); assert_eq!(columnar_reader.num_rows(), 4);
assert_eq!(columnar_reader.num_columns(), 2); assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("mixed").unwrap(); let cols = columnar_reader.read_columns("mixed").unwrap();
@@ -403,11 +419,11 @@ fn test_merge_columnar_different_types() {
panic!() panic!()
}; };
assert_eq!(vals.get_cardinality(), Cardinality::Optional); assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), Vec::<i64>::new()); assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), Vec::<i64>::new()); assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(2).collect_vec(), Vec::<i64>::new()); assert_eq!(vals.values_for_doc(2).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(3).collect_vec(), vec![1]); assert_eq!(vals.values_for_doc(3).collect_vec(), vec![1]);
assert_eq!(vals.values_for_doc(4).collect_vec(), Vec::<i64>::new()); assert_eq!(vals.values_for_doc(4).collect_vec(), vec![]);
// text column // text column
let dynamic_column = cols[1].open().unwrap(); let dynamic_column = cols[1].open().unwrap();
@@ -458,7 +474,7 @@ fn test_merge_columnar_different_empty_cardinality() {
) )
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_docs(), 2); assert_eq!(columnar_reader.num_rows(), 2);
assert_eq!(columnar_reader.num_columns(), 2); assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("mixed").unwrap(); let cols = columnar_reader.read_columns("mixed").unwrap();
@@ -470,119 +486,3 @@ fn test_merge_columnar_different_empty_cardinality() {
let dynamic_column = cols[1].open().unwrap(); let dynamic_column = cols[1].open().unwrap();
assert_eq!(dynamic_column.get_cardinality(), Cardinality::Optional); assert_eq!(dynamic_column.get_cardinality(), Cardinality::Optional);
} }
#[derive(Debug, Clone)]
struct ColumnSpec {
column_name: String,
/// (row_id, term)
terms: Vec<(RowId, Vec<u8>)>,
}
#[derive(Clone, Debug)]
struct ColumnarSpec {
columns: Vec<ColumnSpec>,
}
/// Generate a random (row_id, term) pair:
/// - row_id in [0..10]
/// - term is either from POSSIBLE_TERMS or random bytes
fn rowid_and_term_strategy() -> impl Strategy<Value = (RowId, Vec<u8>)> {
const POSSIBLE_TERMS: &[&[u8]] = &[b"a", b"b", b"allo"];
let term_strat = prop_oneof![
// pick from the fixed list
(0..POSSIBLE_TERMS.len()).prop_map(|i| POSSIBLE_TERMS[i].to_vec()),
// or random bytes (length 0..10)
prop::collection::vec(any::<u8>(), 0..10),
];
(0u32..11, term_strat)
}
/// Generate one ColumnSpec, with a random name and a random list of (row_id, term).
/// We sort it by row_id so that data is in ascending order.
fn column_spec_strategy() -> impl Strategy<Value = ColumnSpec> {
let column_name = prop_oneof![
Just("col".to_string()),
Just("col2".to_string()),
"col.*".prop_map(|s| s),
];
// We'll produce 0..8 (rowid,term) entries for this column
let data_strat = vec(rowid_and_term_strategy(), 0..8).prop_map(|mut pairs| {
// Sort by row_id
pairs.sort_by_key(|(row_id, _)| *row_id);
pairs
});
(column_name, data_strat).prop_map(|(name, data)| ColumnSpec {
column_name: name,
terms: data,
})
}
/// Strategy to generate an ColumnarSpec
fn columnar_strategy() -> impl Strategy<Value = ColumnarSpec> {
vec(column_spec_strategy(), 0..3).prop_map(|columns| ColumnarSpec { columns })
}
/// Strategy to generate multiple ColumnarSpecs, each of which we will treat
/// as one "columnar" to be merged together.
fn columnars_strategy() -> impl Strategy<Value = Vec<ColumnarSpec>> {
vec(columnar_strategy(), 1..4)
}
/// Build a `ColumnarReader` from a `ColumnarSpec`
fn build_columnar(spec: &ColumnarSpec) -> ColumnarReader {
let mut writer = ColumnarWriter::default();
let mut max_row_id = 0;
for col in &spec.columns {
for &(row_id, ref term) in &col.terms {
writer.record_bytes(row_id, &col.column_name, term);
max_row_id = max_row_id.max(row_id);
}
}
let mut buffer = Vec::new();
writer.serialize(max_row_id + 1, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
proptest! {
// We just test that the merge_columnar function doesn't crash.
#![proptest_config(ProptestConfig::with_cases(256))]
#[test]
fn test_merge_columnar_bytes_no_crash(columnars in columnars_strategy(), second_merge_columnars in columnars_strategy()) {
let columnars: Vec<ColumnarReader> = columnars.iter()
.map(build_columnar)
.collect();
let mut out = Vec::new();
let columnar_refs: Vec<&ColumnarReader> = columnars.iter().collect();
let stack_merge_order = StackMergeOrder::stack(&columnar_refs);
merge_columnar(
&columnar_refs,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut out,
).unwrap();
let merged_reader = ColumnarReader::open(out).unwrap();
// Merge the second set of columnars with the result of the first merge
let mut columnars: Vec<ColumnarReader> = second_merge_columnars.iter()
.map(build_columnar)
.collect();
columnars.push(merged_reader);
let mut out = Vec::new();
let columnar_refs: Vec<&ColumnarReader> = columnars.iter().collect();
let stack_merge_order = StackMergeOrder::stack(&columnar_refs);
merge_columnar(
&columnar_refs,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut out,
).unwrap();
}
}

View File

@@ -5,9 +5,9 @@ mod reader;
mod writer; mod writer;
pub use column_type::{ColumnType, HasAssociatedColumnType}; pub use column_type::{ColumnType, HasAssociatedColumnType};
pub use format_version::{CURRENT_VERSION, Version}; pub use format_version::{Version, CURRENT_VERSION};
#[cfg(test)] #[cfg(test)]
pub(crate) use merge::ColumnTypeCategory; pub(crate) use merge::ColumnTypeCategory;
pub use merge::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, merge_columnar}; pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
pub use reader::ColumnarReader; pub use reader::ColumnarReader;
pub use writer::ColumnarWriter; pub use writer::ColumnarWriter;

View File

@@ -1,11 +1,10 @@
use std::{fmt, io, mem}; use std::{fmt, io, mem};
use common::BinarySerializable;
use common::file_slice::FileSlice; use common::file_slice::FileSlice;
use common::json_path_writer::JSON_PATH_SEGMENT_SEP; use common::BinarySerializable;
use sstable::{Dictionary, RangeSSTable}; use sstable::{Dictionary, RangeSSTable};
use crate::columnar::{ColumnType, format_version}; use crate::columnar::{format_version, ColumnType};
use crate::dynamic_column::DynamicColumnHandle; use crate::dynamic_column::DynamicColumnHandle;
use crate::{RowId, Version}; use crate::{RowId, Version};
@@ -19,13 +18,13 @@ fn io_invalid_data(msg: String) -> io::Error {
pub struct ColumnarReader { pub struct ColumnarReader {
column_dictionary: Dictionary<RangeSSTable>, column_dictionary: Dictionary<RangeSSTable>,
column_data: FileSlice, column_data: FileSlice,
num_docs: RowId, num_rows: RowId,
format_version: Version, format_version: Version,
} }
impl fmt::Debug for ColumnarReader { impl fmt::Debug for ColumnarReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let num_rows = self.num_docs(); let num_rows = self.num_rows();
let columns = self.list_columns().unwrap(); let columns = self.list_columns().unwrap();
let num_cols = columns.len(); let num_cols = columns.len();
let mut debug_struct = f.debug_struct("Columnar"); let mut debug_struct = f.debug_struct("Columnar");
@@ -77,19 +76,6 @@ fn read_all_columns_in_stream(
Ok(results) Ok(results)
} }
fn column_dictionary_prefix_for_column_name(column_name: &str) -> String {
// Each column is a associated to a given `column_key`,
// that starts by `column_name\0column_header`.
//
// Listing the columns associated to the given column name is therefore equivalent to
// listing `column_key` with the prefix `column_name\0`.
format!("{}{}", column_name, '\0')
}
fn column_dictionary_prefix_for_subpath(root_path: &str) -> String {
format!("{}{}", root_path, JSON_PATH_SEGMENT_SEP as char)
}
impl ColumnarReader { impl ColumnarReader {
/// Opens a new Columnar file. /// Opens a new Columnar file.
pub fn open<F>(file_slice: F) -> io::Result<ColumnarReader> pub fn open<F>(file_slice: F) -> io::Result<ColumnarReader>
@@ -112,13 +98,13 @@ impl ColumnarReader {
Ok(ColumnarReader { Ok(ColumnarReader {
column_dictionary, column_dictionary,
column_data, column_data,
num_docs: num_rows, num_rows,
format_version, format_version,
}) })
} }
pub fn num_docs(&self) -> RowId { pub fn num_rows(&self) -> RowId {
self.num_docs self.num_rows
} }
// Iterate over the columns in a sorted way // Iterate over the columns in a sorted way
pub fn iter_columns( pub fn iter_columns(
@@ -158,14 +144,32 @@ impl ColumnarReader {
Ok(self.iter_columns()?.collect()) Ok(self.iter_columns()?.collect())
} }
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
// Each column is a associated to a given `column_key`,
// that starts by `column_name\0column_header`.
//
// Listing the columns associated to the given column name is therefore equivalent to
// listing `column_key` with the prefix `column_name\0`.
//
// This is in turn equivalent to searching for the range
// `[column_name,\0`..column_name\1)`.
// TODO can we get some more generic `prefix(..)` logic in the dictionary.
let mut start_key = column_name.to_string();
start_key.push('\0');
let mut end_key = column_name.to_string();
end_key.push(1u8 as char);
self.column_dictionary
.range()
.ge(start_key.as_bytes())
.lt(end_key.as_bytes())
}
pub async fn read_columns_async( pub async fn read_columns_async(
&self, &self,
column_name: &str, column_name: &str,
) -> io::Result<Vec<DynamicColumnHandle>> { ) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_column_name(column_name);
let stream = self let stream = self
.column_dictionary .stream_for_column_range(column_name)
.prefix_range(prefix)
.into_stream_async() .into_stream_async()
.await?; .await?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version) read_all_columns_in_stream(stream, &self.column_data, self.format_version)
@@ -176,35 +180,7 @@ impl ColumnarReader {
/// There can be more than one column associated to a given column name, provided they have /// There can be more than one column associated to a given column name, provided they have
/// different types. /// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> { pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_column_name(column_name); let stream = self.stream_for_column_range(column_name).into_stream()?;
let stream = self.column_dictionary.prefix_range(prefix).into_stream()?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
pub async fn read_subpath_columns_async(
&self,
root_path: &str,
) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_subpath(root_path);
let stream = self
.column_dictionary
.prefix_range(prefix)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Get all inner columns for a given JSON prefix, i.e columns for which the name starts
/// with the prefix then contain the [`JSON_PATH_SEGMENT_SEP`].
///
/// There can be more than one column associated to each path within the JSON structure,
/// provided they have different types.
pub fn read_subpath_columns(&self, root_path: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_subpath(root_path);
let stream = self
.column_dictionary
.prefix_range(prefix.as_bytes())
.into_stream()?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version) read_all_columns_in_stream(stream, &self.column_data, self.format_version)
} }
@@ -216,8 +192,6 @@ impl ColumnarReader {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use crate::{ColumnType, ColumnarReader, ColumnarWriter}; use crate::{ColumnType, ColumnarReader, ColumnarWriter};
#[test] #[test]
@@ -250,64 +224,6 @@ mod tests {
assert_eq!(columns[0].1.column_type(), ColumnType::U64); assert_eq!(columns[0].1.column_type(), ColumnType::U64);
} }
#[test]
fn test_read_columns() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_column_type("col", ColumnType::U64, false);
columnar_writer.record_numerical(1, "col", 1u64);
let mut buffer = Vec::new();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
{
let columns = columnar.read_columns("col").unwrap();
assert_eq!(columns.len(), 1);
assert_eq!(columns[0].column_type(), ColumnType::U64);
}
{
let columns = columnar.read_columns("other").unwrap();
assert_eq!(columns.len(), 0);
}
}
#[test]
fn test_read_subpath_columns() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_str(
0,
&format!("col1{}subcol1", JSON_PATH_SEGMENT_SEP as char),
"hello",
);
columnar_writer.record_numerical(
0,
&format!("col1{}subcol2", JSON_PATH_SEGMENT_SEP as char),
1i64,
);
columnar_writer.record_str(1, "col1", "hello");
columnar_writer.record_str(0, "col2", "hello");
let mut buffer = Vec::new();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
{
let columns = columnar.read_subpath_columns("col1").unwrap();
assert_eq!(columns.len(), 2);
assert_eq!(columns[0].column_type(), ColumnType::Str);
assert_eq!(columns[1].column_type(), ColumnType::I64);
}
{
let columns = columnar.read_subpath_columns("col1.subcol1").unwrap();
assert_eq!(columns.len(), 0);
}
{
let columns = columnar.read_subpath_columns("col2").unwrap();
assert_eq!(columns.len(), 0);
}
{
let columns = columnar.read_subpath_columns("other").unwrap();
assert_eq!(columns.len(), 0);
}
}
#[test] #[test]
#[should_panic(expected = "Input type forbidden")] #[should_panic(expected = "Input type forbidden")]
fn test_list_columns_strict_typing_panics_on_wrong_types() { fn test_list_columns_strict_typing_panics_on_wrong_types() {

View File

@@ -122,6 +122,7 @@ impl<T> From<T> for ColumnOperation<T> {
// In order to limit memory usage, and in order // In order to limit memory usage, and in order
// to benefit from the stacker, we do this by serialization our data // to benefit from the stacker, we do this by serialization our data
// as "Symbols". // as "Symbols".
#[allow(clippy::from_over_into)]
pub(super) trait SymbolValue: Clone + Copy { pub(super) trait SymbolValue: Clone + Copy {
// Serializes the symbol into the given buffer. // Serializes the symbol into the given buffer.
// Returns the number of bytes written into the buffer. // Returns the number of bytes written into the buffer.
@@ -244,7 +245,7 @@ impl SymbolValue for UnorderedId {
fn compute_num_bytes_for_u64(val: u64) -> usize { fn compute_num_bytes_for_u64(val: u64) -> usize {
let msb = (64u32 - val.leading_zeros()) as usize; let msb = (64u32 - val.leading_zeros()) as usize;
msb.div_ceil(8) (msb + 7) / 8
} }
fn encode_zig_zag(n: i64) -> u64 { fn encode_zig_zag(n: i64) -> u64 {

View File

@@ -42,7 +42,7 @@ impl ColumnWriter {
&self, &self,
arena: &MemoryArena, arena: &MemoryArena,
buffer: &'a mut Vec<u8>, buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<V>> + 'a + use<'a, V> { ) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
buffer.clear(); buffer.clear();
self.values.read_to_end(arena, buffer); self.values.read_to_end(arena, buffer);
let mut cursor: &[u8] = &buffer[..]; let mut cursor: &[u8] = &buffer[..];
@@ -104,10 +104,9 @@ pub(crate) struct NumericalColumnWriter {
impl NumericalColumnWriter { impl NumericalColumnWriter {
pub fn force_numerical_type(&mut self, numerical_type: NumericalType) { pub fn force_numerical_type(&mut self, numerical_type: NumericalType) {
assert!( assert!(self
self.compatible_numerical_types .compatible_numerical_types
.is_type_accepted(numerical_type) .is_type_accepted(numerical_type));
);
self.compatible_numerical_types = CompatibleNumericalTypes::StaticType(numerical_type); self.compatible_numerical_types = CompatibleNumericalTypes::StaticType(numerical_type);
} }
} }
@@ -212,7 +211,7 @@ impl NumericalColumnWriter {
self, self,
arena: &MemoryArena, arena: &MemoryArena,
buffer: &'a mut Vec<u8>, buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a + use<'a> { ) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
self.column_writer.operation_iterator(arena, buffer) self.column_writer.operation_iterator(arena, buffer)
} }
} }
@@ -256,7 +255,7 @@ impl StrOrBytesColumnWriter {
&self, &self,
arena: &MemoryArena, arena: &MemoryArena,
byte_buffer: &'a mut Vec<u8>, byte_buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a + use<'a> { ) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
self.column_writer.operation_iterator(arena, byte_buffer) self.column_writer.operation_iterator(arena, byte_buffer)
} }
} }

View File

@@ -8,13 +8,13 @@ use std::net::Ipv6Addr;
use column_operation::ColumnOperation; use column_operation::ColumnOperation;
pub(crate) use column_writers::CompatibleNumericalTypes; pub(crate) use column_writers::CompatibleNumericalTypes;
use common::CountingWriter;
use common::json_path_writer::JSON_END_OF_PATH; use common::json_path_writer::JSON_END_OF_PATH;
use common::CountingWriter;
pub(crate) use serializer::ColumnarSerializer; pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena}; use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex}; use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
use crate::column_values::{MonotonicallyMappableToU64, MonotonicallyMappableToU128}; use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use crate::columnar::column_type::ColumnType; use crate::columnar::column_type::ColumnType;
use crate::columnar::writer::column_writers::{ use crate::columnar::writer::column_writers::{
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter, ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
@@ -285,6 +285,7 @@ impl ColumnarWriter {
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)), .map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
); );
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries); let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new(); let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns { for (column_name, column_type, addr) in columns {
@@ -391,7 +392,7 @@ impl ColumnarWriter {
// Serialize [Dictionary, Column, dictionary num bytes U32::LE] // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
// Column: [Column Index, Column Values, column index num bytes U32::LE] // Column: [Column Index, Column Values, column index num bytes U32::LE]
#[expect(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn serialize_bytes_or_str_column( fn serialize_bytes_or_str_column(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: RowId, num_docs: RowId,

View File

@@ -3,11 +3,11 @@ use std::io::Write;
use common::json_path_writer::JSON_END_OF_PATH; use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, CountingWriter}; use common::{BinarySerializable, CountingWriter};
use sstable::RangeSSTable;
use sstable::value::RangeValueWriter; use sstable::value::RangeValueWriter;
use sstable::RangeSSTable;
use crate::RowId;
use crate::columnar::ColumnType; use crate::columnar::ColumnType;
use crate::RowId;
pub struct ColumnarSerializer<W: io::Write> { pub struct ColumnarSerializer<W: io::Write> {
wrt: CountingWriter<W>, wrt: CountingWriter<W>,
@@ -67,7 +67,7 @@ pub struct ColumnSerializer<'a, W: io::Write> {
start_offset: u64, start_offset: u64,
} }
impl<W: io::Write> ColumnSerializer<'_, W> { impl<'a, W: io::Write> ColumnSerializer<'a, W> {
pub fn finalize(self) -> io::Result<()> { pub fn finalize(self) -> io::Result<()> {
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes(); let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
let byte_range = self.start_offset..end_offset; let byte_range = self.start_offset..end_offset;
@@ -80,7 +80,7 @@ impl<W: io::Write> ColumnSerializer<'_, W> {
} }
} }
impl<W: io::Write> io::Write for ColumnSerializer<'_, W> { impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.columnar_serializer.wrt.write(buf) self.columnar_serializer.wrt.write(buf)
} }

View File

@@ -1,6 +1,6 @@
use crate::RowId;
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex}; use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::RowId;
/// The `IndexBuilder` interprets a sequence of /// The `IndexBuilder` interprets a sequence of
/// calls of the form: /// calls of the form:
@@ -31,13 +31,12 @@ pub struct OptionalIndexBuilder {
impl OptionalIndexBuilder { impl OptionalIndexBuilder {
pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ { pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
debug_assert!( debug_assert!(self
self.docs .docs
.last() .last()
.copied() .copied()
.map(|last_doc| last_doc < num_rows) .map(|last_doc| last_doc < num_rows)
.unwrap_or(true) .unwrap_or(true));
);
&self.docs[..] &self.docs[..]
} }
@@ -49,13 +48,12 @@ impl OptionalIndexBuilder {
impl IndexBuilder for OptionalIndexBuilder { impl IndexBuilder for OptionalIndexBuilder {
#[inline(always)] #[inline(always)]
fn record_row(&mut self, doc: RowId) { fn record_row(&mut self, doc: RowId) {
debug_assert!( debug_assert!(self
self.docs .docs
.last() .last()
.copied() .copied()
.map(|prev_doc| doc > prev_doc) .map(|prev_doc| doc > prev_doc)
.unwrap_or(true) .unwrap_or(true));
);
self.docs.push(doc); self.docs.push(doc);
} }
} }

View File

@@ -3,8 +3,8 @@ use std::path::PathBuf;
use itertools::Itertools; use itertools::Itertools;
use crate::{ use crate::{
CURRENT_VERSION, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder, merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
merge_columnar, CURRENT_VERSION,
}; };
const NUM_DOCS: u32 = u16::MAX as u32; const NUM_DOCS: u32 = u16::MAX as u32;

View File

@@ -3,11 +3,10 @@ use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};
use common::file_slice::FileSlice; use common::file_slice::FileSlice;
use common::{ByteCount, DateTime, OwnedBytes}; use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use serde::{Deserialize, Serialize};
use crate::column::{BytesColumn, Column, StrColumn}; use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column}; use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType; use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version}; use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
@@ -318,89 +317,10 @@ impl DynamicColumnHandle {
} }
pub fn num_bytes(&self) -> ByteCount { pub fn num_bytes(&self) -> ByteCount {
self.file_slice.num_bytes() self.file_slice.len().into()
}
/// Legacy helper returning the column space usage.
pub fn column_and_dictionary_num_bytes(&self) -> io::Result<ColumnSpaceUsage> {
self.space_usage()
}
/// Return the space usage of the column, optionally broken down by dictionary and column
/// values.
///
/// For dictionary encoded columns (strings and bytes), this splits the total footprint into
/// the dictionary and the remaining column data (including index and values).
/// For all other column types, the dictionary size is `None` and the column size
/// equals the total bytes.
pub fn space_usage(&self) -> io::Result<ColumnSpaceUsage> {
let total_num_bytes = self.num_bytes();
let dynamic_column = self.open()?;
let dictionary_num_bytes = match &dynamic_column {
DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(),
DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(),
_ => {
return Ok(ColumnSpaceUsage::new(self.num_bytes(), None));
}
};
assert!(dictionary_num_bytes <= total_num_bytes);
let column_num_bytes =
ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes());
Ok(ColumnSpaceUsage::new(
column_num_bytes,
Some(dictionary_num_bytes),
))
} }
pub fn column_type(&self) -> ColumnType { pub fn column_type(&self) -> ColumnType {
self.column_type self.column_type
} }
} }
/// Represents space usage of a column.
///
/// `column_num_bytes` tracks the column payload (index, values and footer).
/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint.
/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ColumnSpaceUsage {
column_num_bytes: ByteCount,
dictionary_num_bytes: Option<ByteCount>,
}
impl ColumnSpaceUsage {
pub(crate) fn new(
column_num_bytes: ByteCount,
dictionary_num_bytes: Option<ByteCount>,
) -> Self {
ColumnSpaceUsage {
column_num_bytes,
dictionary_num_bytes,
}
}
pub fn column_num_bytes(&self) -> ByteCount {
self.column_num_bytes
}
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
self.dictionary_num_bytes
}
pub fn total_num_bytes(&self) -> ByteCount {
self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default()
}
/// Merge two space usage values by summing their components.
pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
(Some(lhs), Some(rhs)) => Some(lhs + rhs),
(Some(val), None) | (None, Some(val)) => Some(val),
(None, None) => None,
};
ColumnSpaceUsage {
column_num_bytes: self.column_num_bytes + other.column_num_bytes,
dictionary_num_bytes,
}
}
}

View File

@@ -7,7 +7,7 @@ pub trait Iterable<T = u64> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>; fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
} }
impl<T: Copy> Iterable<T> for &[T] { impl<'a, T: Copy> Iterable<T> for &'a [T] {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.iter().copied()) Box::new(self.iter().copied())
} }

View File

@@ -17,10 +17,15 @@
//! column. //! column.
//! - [column_values]: Stores the values of a column in a dense format. //! - [column_values]: Stores the values of a column in a dense format.
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(test)] #[cfg(test)]
#[macro_use] #[macro_use]
extern crate more_asserts; extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::fmt::Display; use std::fmt::Display;
use std::io; use std::io;
@@ -39,16 +44,16 @@ pub use block_accessor::ColumnBlockAccessor;
pub use column::{BytesColumn, Column, StrColumn}; pub use column::{BytesColumn, Column, StrColumn};
pub use column_index::ColumnIndex; pub use column_index::ColumnIndex;
pub use column_values::{ pub use column_values::{
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128, ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
}; };
pub use columnar::{ pub use columnar::{
CURRENT_VERSION, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType, merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
}; };
use sstable::VoidSSTable; use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue}; pub use value::{NumericalType, NumericalValue};
pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle}; pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32; pub type RowId = u32;
pub type DocId = u32; pub type DocId = u32;

View File

@@ -60,7 +60,7 @@ fn test_dataframe_writer_bool() {
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!(); panic!();
}; };
let vals: Vec<Option<bool>> = (0..5).map(|doc_id| bool_col.first(doc_id)).collect(); let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]); assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
} }
@@ -108,7 +108,7 @@ fn test_dataframe_writer_ip_addr() {
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!(); panic!();
}; };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|doc_id| ip_col.first(doc_id)).collect(); let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!( assert_eq!(
&vals, &vals,
&[ &[
@@ -169,7 +169,7 @@ fn test_dictionary_encoded_str() {
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!(); panic!();
}; };
let index: Vec<Option<u64>> = (0..5).map(|doc_id| str_col.ords().first(doc_id)).collect(); let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]); assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5); assert_eq!(str_col.num_rows(), 5);
let mut term_buffer = String::new(); let mut term_buffer = String::new();
@@ -204,7 +204,7 @@ fn test_dictionary_encoded_bytes() {
panic!(); panic!();
}; };
let index: Vec<Option<u64>> = (0..5) let index: Vec<Option<u64>> = (0..5)
.map(|doc_id| bytes_col.ords().first(doc_id)) .map(|row_id| bytes_col.ords().first(row_id))
.collect(); .collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]); assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(bytes_col.num_rows(), 5); assert_eq!(bytes_col.num_rows(), 5);
@@ -380,7 +380,7 @@ fn assert_columnar_eq(
right: &ColumnarReader, right: &ColumnarReader,
lenient_on_numerical_value: bool, lenient_on_numerical_value: bool,
) { ) {
assert_eq!(left.num_docs(), right.num_docs()); assert_eq!(left.num_rows(), right.num_rows());
let left_columns = left.list_columns().unwrap(); let left_columns = left.list_columns().unwrap();
let right_columns = right.list_columns().unwrap(); let right_columns = right.list_columns().unwrap();
assert_eq!(left_columns.len(), right_columns.len()); assert_eq!(left_columns.len(), right_columns.len());
@@ -588,7 +588,7 @@ proptest! {
#[test] #[test]
fn test_single_columnar_builder_proptest(docs in columnar_docs_strategy()) { fn test_single_columnar_builder_proptest(docs in columnar_docs_strategy()) {
let columnar = build_columnar(&docs[..]); let columnar = build_columnar(&docs[..]);
assert_eq!(columnar.num_docs() as usize, docs.len()); assert_eq!(columnar.num_rows() as usize, docs.len());
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default(); let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
for (doc_id, doc_vals) in docs.iter().enumerate() { for (doc_id, doc_vals) in docs.iter().enumerate() {
for (col_name, col_val) in doc_vals { for (col_name, col_val) in doc_vals {
@@ -715,9 +715,8 @@ fn test_columnar_merging_number_columns() {
// TODO test required_columns // TODO test required_columns
// TODO document edge case: required_columns incompatible with values. // TODO document edge case: required_columns incompatible with values.
#[allow(clippy::type_complexity)] fn columnar_docs_and_remap(
fn columnar_docs_and_remap() ) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
-> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map( proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| { |columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
let row_addrs: Vec<RowAddr> = columnars_docs let row_addrs: Vec<RowAddr> = columnars_docs
@@ -820,7 +819,7 @@ fn test_columnar_merge_empty() {
) )
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_docs(), 0); assert_eq!(merged_columnar.num_rows(), 0);
assert_eq!(merged_columnar.num_columns(), 0); assert_eq!(merged_columnar.num_columns(), 0);
} }
@@ -846,7 +845,7 @@ fn test_columnar_merge_single_str_column() {
) )
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_docs(), 1); assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1); assert_eq!(merged_columnar.num_columns(), 1);
} }
@@ -878,7 +877,7 @@ fn test_delete_decrease_cardinality() {
) )
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_docs(), 1); assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1); assert_eq!(merged_columnar.num_columns(), 1);
let cols = merged_columnar.read_columns("c").unwrap(); let cols = merged_columnar.read_columns("c").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);

View File

@@ -1,5 +1,3 @@
use std::str::FromStr;
use common::DateTime; use common::DateTime;
use crate::InvalidData; use crate::InvalidData;
@@ -11,23 +9,6 @@ pub enum NumericalValue {
F64(f64), F64(f64),
} }
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue { impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType { pub fn numerical_type(&self) -> NumericalType {
match self { match self {
@@ -45,7 +26,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 { if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64) NumericalValue::I64(val as i64)
} else { } else {
NumericalValue::U64(val) NumericalValue::F64(val as f64)
} }
} }
NumericalValue::I64(val) => NumericalValue::I64(val), NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -160,7 +141,6 @@ impl Coerce for DateTime {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::NumericalType; use super::NumericalType;
use crate::NumericalValue;
#[test] #[test]
fn test_numerical_type_code() { fn test_numerical_type_code() {
@@ -173,58 +153,4 @@ mod tests {
} }
assert_eq!(num_numerical_type, 3); assert_eq!(num_numerical_type, 3);
} }
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
} }

View File

@@ -1,9 +1,9 @@
[package] [package]
name = "tantivy-common" name = "tantivy-common"
version = "0.10.0" version = "0.7.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"] authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT" license = "MIT"
edition = "2024" edition = "2021"
description = "common traits and utility functions used by multiple tantivy subcrates" description = "common traits and utility functions used by multiple tantivy subcrates"
documentation = "https://docs.rs/tantivy_common/" documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy" homepage = "https://github.com/quickwit-oss/tantivy"
@@ -13,13 +13,13 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies] [dependencies]
byteorder = "1.4.3" byteorder = "1.4.3"
ownedbytes = { version= "0.9", path="../ownedbytes" } ownedbytes = { version= "0.7", path="../ownedbytes" }
async-trait = "0.1" async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] } time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies] [dev-dependencies]
binggan = "0.14.0" binggan = "0.12.0"
proptest = "1.0.0" proptest = "1.0.0"
rand = "0.8.4" rand = "0.8.4"

View File

@@ -1,7 +1,7 @@
use binggan::{BenchRunner, black_box}; use binggan::{black_box, BenchRunner};
use rand::seq::IteratorRandom; use rand::seq::IteratorRandom;
use rand::thread_rng; use rand::thread_rng;
use tantivy_common::{BitSet, TinySet, serialize_vint_u32}; use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
fn bench_vint() { fn bench_vint() {
let mut runner = BenchRunner::new(); let mut runner = BenchRunner::new();
@@ -15,6 +15,7 @@ fn bench_vint() {
out += u64::from(buf[0]); out += u64::from(buf[0]);
} }
black_box(out); black_box(out);
None
}); });
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000); let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
@@ -26,6 +27,7 @@ fn bench_vint() {
out += u64::from(buf[0]); out += u64::from(buf[0]);
} }
black_box(out); black_box(out);
None
}); });
} }
@@ -41,20 +43,24 @@ fn bench_bitset() {
tinyset.pop_lowest(); tinyset.pop_lowest();
tinyset.pop_lowest(); tinyset.pop_lowest();
black_box(tinyset); black_box(tinyset);
None
}); });
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32); let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
runner.bench_function("bench_tinyset_sum", move |_| { runner.bench_function("bench_tinyset_sum", move |_| {
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32); assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
None
}); });
let v = [10u32, 14u32, 21u32]; let v = [10u32, 14u32, 21u32];
runner.bench_function("bench_tinyarr_sum", move |_| { runner.bench_function("bench_tinyarr_sum", move |_| {
black_box(v.iter().cloned().sum::<u32>()); black_box(v.iter().cloned().sum::<u32>());
None
}); });
runner.bench_function("bench_bitset_initialize", move |_| { runner.bench_function("bench_bitset_initialize", move |_| {
black_box(BitSet::with_max_value(1_000_000)); black_box(BitSet::with_max_value(1_000_000));
None
}); });
} }

View File

@@ -181,17 +181,9 @@ pub struct BitSet {
len: u64, len: u64,
max_value: u32, max_value: u32,
} }
impl std::fmt::Debug for BitSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BitSet")
.field("len", &self.len)
.field("max_value", &self.max_value)
.finish()
}
}
fn num_buckets(max_val: u32) -> u32 { fn num_buckets(max_val: u32) -> u32 {
max_val.div_ceil(64u32) (max_val + 63u32) / 64u32
} }
impl BitSet { impl BitSet {

View File

@@ -65,11 +65,11 @@ pub fn transform_bound_inner_res<TFrom, TTo>(
) -> io::Result<Bound<TTo>> { ) -> io::Result<Bound<TTo>> {
use self::Bound::*; use self::Bound::*;
Ok(match bound { Ok(match bound {
Excluded(from_val) => match transform(from_val)? { Excluded(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val, TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val), TransformBound::Existing(new_val) => Excluded(new_val),
}, },
Included(from_val) => match transform(from_val)? { Included(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val, TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val), TransformBound::Existing(new_val) => Included(new_val),
}, },
@@ -85,11 +85,11 @@ pub fn transform_bound_inner<TFrom, TTo>(
) -> Bound<TTo> { ) -> Bound<TTo> {
use self::Bound::*; use self::Bound::*;
match bound { match bound {
Excluded(from_val) => match transform(from_val) { Excluded(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val, TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val), TransformBound::Existing(new_val) => Excluded(new_val),
}, },
Included(from_val) => match transform(from_val) { Included(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val, TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val), TransformBound::Existing(new_val) => Included(new_val),
}, },
@@ -111,8 +111,8 @@ pub fn map_bound<TFrom, TTo>(
) -> Bound<TTo> { ) -> Bound<TTo> {
use self::Bound::*; use self::Bound::*;
match bound { match bound {
Excluded(from_val) => Bound::Excluded(transform(from_val)), Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
Included(from_val) => Bound::Included(transform(from_val)), Included(ref from_val) => Bound::Included(transform(from_val)),
Unbounded => Unbounded, Unbounded => Unbounded,
} }
} }
@@ -123,8 +123,8 @@ pub fn map_bound_res<TFrom, TTo, Err>(
) -> Result<Bound<TTo>, Err> { ) -> Result<Bound<TTo>, Err> {
use self::Bound::*; use self::Bound::*;
Ok(match bound { Ok(match bound {
Excluded(from_val) => Excluded(transform(from_val)?), Excluded(ref from_val) => Excluded(transform(from_val)?),
Included(from_val) => Included(transform(from_val)?), Included(ref from_val) => Included(transform(from_val)?),
Unbounded => Unbounded, Unbounded => Unbounded,
}) })
} }

View File

@@ -1,6 +1,5 @@
use std::fs::File; use std::fs::File;
use std::ops::{Deref, Range, RangeBounds}; use std::ops::{Deref, Range, RangeBounds};
use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};
@@ -74,7 +73,7 @@ impl FileHandle for WrapFile {
{ {
use std::io::{Read, Seek}; use std::io::{Read, Seek};
let mut file = self.file.try_clone()?; // Clone the file to read from it separately let mut file = self.file.try_clone()?; // Clone the file to read from it separately
// Seek to the start position in the file // Seek to the start position in the file
file.seek(io::SeekFrom::Start(start as u64))?; file.seek(io::SeekFrom::Start(start as u64))?;
// Read the data into the buffer // Read the data into the buffer
file.read_exact(&mut buffer)?; file.read_exact(&mut buffer)?;
@@ -178,12 +177,6 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
} }
impl FileSlice { impl FileSlice {
/// Creates a FileSlice from a path.
pub fn open(path: &Path) -> io::Result<FileSlice> {
let wrap_file = WrapFile::new(File::open(path)?)?;
Ok(FileSlice::new(Arc::new(wrap_file)))
}
/// Wraps a FileHandle. /// Wraps a FileHandle.
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self { pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
let num_bytes = file_handle.len(); let num_bytes = file_handle.len();
@@ -346,8 +339,8 @@ mod tests {
use std::sync::Arc; use std::sync::Arc;
use super::{FileHandle, FileSlice}; use super::{FileHandle, FileSlice};
use crate::HasLen;
use crate::file_slice::combine_ranges; use crate::file_slice::combine_ranges;
use crate::HasLen;
#[test] #[test]
fn test_file_slice() -> io::Result<()> { fn test_file_slice() -> io::Result<()> {

View File

@@ -22,7 +22,7 @@ pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref}; pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{ pub use vint::{
VInt, VIntU128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
}; };
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite}; pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
@@ -130,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
} }
#[cfg(test)] #[cfg(test)]
pub(crate) mod test { pub mod test {
use proptest::prelude::*; use proptest::prelude::*;
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
fn test_i64_converter_helper(val: i64) { fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val); assert_eq!(u64_to_i64(i64_to_u64(val)), val);
@@ -144,6 +144,12 @@ pub(crate) mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val); assert_eq!(u64_to_f64(f64_to_u64(val)), val);
} }
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! { proptest! {
#[test] #[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) { fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
@@ -177,10 +183,8 @@ pub(crate) mod test {
#[test] #[test]
fn test_f64_order() { fn test_f64_order() {
assert!( assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)) .contains(&f64_to_u64(f64::NAN))); // nan is not a number
.contains(&f64_to_u64(f64::NAN))
); // nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa

View File

@@ -74,14 +74,14 @@ impl FixedSize for () {
impl<T: BinarySerializable> BinarySerializable for Vec<T> { impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?; VInt(self.len() as u64).serialize(writer)?;
for it in self { for it in self {
it.serialize(writer)?; it.serialize(writer)?;
} }
Ok(()) Ok(())
} }
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val(); let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize); let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items { for _ in 0..num_items {
let item = T::deserialize(reader)?; let item = T::deserialize(reader)?;
@@ -236,12 +236,12 @@ impl FixedSize for bool {
impl BinarySerializable for String { impl BinarySerializable for String {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes(); let data: &[u8] = self.as_bytes();
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?; VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data) writer.write_all(data)
} }
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize; let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length); let mut result = String::with_capacity(string_length);
reader reader
.take(string_length as u64) .take(string_length as u64)
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
impl<'a> BinarySerializable for Cow<'a, str> { impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes(); let data: &[u8] = self.as_bytes();
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?; VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data) writer.write_all(data)
} }
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize; let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length); let mut result = String::with_capacity(string_length);
reader reader
.take(string_length as u64) .take(string_length as u64)
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
impl<'a> BinarySerializable for Cow<'a, [u8]> { impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?; VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() { for it in self.iter() {
BinarySerializable::serialize(it, writer)?; it.serialize(writer)?;
} }
Ok(()) Ok(())
} }
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val(); let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize); let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items { for _ in 0..num_items {
let item = <u8 as BinarySerializable>::deserialize(reader)?; let item = u8::deserialize(reader)?;
items.push(item); items.push(item);
} }
Ok(Cow::Owned(items)) Ok(Cow::Owned(items))

View File

@@ -28,9 +28,7 @@ impl BinarySerializable for VIntU128 {
writer.write_all(&buffer) writer.write_all(&buffer)
} }
#[allow(clippy::unbuffered_bytes)]
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes(); let mut bytes = reader.bytes();
let mut result = 0u128; let mut result = 0u128;
let mut shift = 0u64; let mut shift = 0u64;
@@ -197,9 +195,7 @@ impl BinarySerializable for VInt {
writer.write_all(&buffer[0..num_bytes]) writer.write_all(&buffer[0..num_bytes])
} }
#[allow(clippy::unbuffered_bytes)]
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes(); let mut bytes = reader.bytes();
let mut result = 0u64; let mut result = 0u64;
let mut shift = 0u64; let mut shift = 0u64;
@@ -226,7 +222,7 @@ impl BinarySerializable for VInt {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{BinarySerializable, VInt, serialize_vint_u32}; use super::{serialize_vint_u32, BinarySerializable, VInt};
fn aux_test_vint(val: u64) { fn aux_test_vint(val: u64) {
let mut v = [14u8; 10]; let mut v = [14u8; 10];

View File

@@ -87,7 +87,7 @@ impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
} }
} }
impl TerminatingWrite for &mut Vec<u8> { impl<'a> TerminatingWrite for &'a mut Vec<u8> {
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> { fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
self.flush() self.flush()
} }

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.4 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 653 KiB

View File

@@ -2,7 +2,7 @@
> Tantivy is a **search** engine **library** for Rust. > Tantivy is a **search** engine **library** for Rust.
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
they both have the same scope and targeted use cases. they both have the same scope and targeted use cases.
If you are not familiar with Lucene, let's break down our little tagline. If you are not familiar with Lucene, let's break down our little tagline.
@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
experience. But keep in mind this is just a toolbox. experience. But keep in mind this is just a toolbox.
Which bring us to the second keyword... Which bring us to the second keyword...
- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance. - **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
Sometimes a functionality will not be available in tantivy because it is too Sometimes a functionality will not be available in tantivy because it is too
specific to your use case. By design, tantivy should make it possible to extend specific to your use case. By design, tantivy should make it possible to extend
@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
index from a different format. index from a different format.
Tantivy exposes a lot of low level API to do all of these things. Tantivy exposes a lot of low level API to do all of these things.

View File

@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
While this design has some downsides, this greatly simplifies the source code of While this design has some downsides, this greatly simplifies the source code of
tantivy. Caching is also entirely delegated to the OS. tantivy. Caching is also entirely delegated to the OS.
Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds. `tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case. This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.

View File

@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents. When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids. E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
Note: tantivy 0.16 does not do this optimization yet. Note: Tantivy 0.16 does not do this optimization yet.
### Pruning ### Pruning
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter. Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
Note: tantivy 0.16 does not do this optimization yet. Note: Tantivy 0.16 does not do this optimization yet.
### Other? ### Other?
@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i
## Usage ## Usage
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used. The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
```rust ```rust
let settings = IndexSettings { let settings = IndexSettings {

View File

@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
- `value`: The value representation is just the regular Value representation. - `value`: The value representation is just the regular Value representation.
This representation is designed to align the natural sort of Terms with the lexicographical sort This representation is designed to align the natural sort of Terms with the lexicographical sort
of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding). of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
In the example above, the terms will be sorted as In the example above, the terms will be sorted as

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
// Our second field is body. // Our second field is body.
// We want full-text search for it, but we do not // We want full-text search for it, but we do not
// need to be able to retrieve it // need to be able to be able to retrieve it
// for our application. // for our application.
// //
// We can make our index lighter by omitting the `STORED` flag. // We can make our index lighter by omitting the `STORED` flag.
@@ -208,7 +208,7 @@ fn main() -> tantivy::Result<()> {
// is the role of the `TopDocs` collector. // is the role of the `TopDocs` collector.
// We can now perform our query. // We can now perform our query.
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
// The actual documents still need to be // The actual documents still need to be
// retrieved from Tantivy's store. // retrieved from Tantivy's store.
@@ -226,7 +226,7 @@ fn main() -> tantivy::Result<()> {
let query = query_parser.parse_query("title:sea^20 body:whale^70")?; let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
let (_score, doc_address) = searcher let (_score, doc_address) = searcher
.search(&query, &TopDocs::with_limit(1).order_by_score())? .search(&query, &TopDocs::with_limit(1))?
.into_iter() .into_iter()
.next() .next()
.unwrap(); .unwrap();

View File

@@ -100,7 +100,7 @@ fn main() -> tantivy::Result<()> {
// here we want to get a hit on the 'ken' in Frankenstein // here we want to get a hit on the 'ken' in Frankenstein
let query = query_parser.parse_query("ken")?; let query = query_parser.parse_query("ken")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs { for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;

Some files were not shown because too many files have changed in this diff Show More