Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
7b06db062b Adding a special method to Arc<dyn ColumnValues> 2023-02-14 23:14:14 +09:00
329 changed files with 11017 additions and 31014 deletions

View File

@@ -3,23 +3,20 @@ name: Coverage
on:
push:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
pull_request:
branches: [main]
jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -8,18 +8,13 @@ env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:

View File

@@ -9,18 +9,13 @@ on:
env:
CARGO_TERM_COLOR: always
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install nightly
uses: actions-rs/toolchain@v1
@@ -39,13 +34,6 @@ jobs:
- name: Check Formatting
run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
- uses: actions-rs/clippy-check@v1
with:
@@ -60,14 +48,14 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1

2
.gitignore vendored
View File

@@ -13,5 +13,3 @@ benchmark
.idea
trace.dat
cargo-timing*
control
variable

View File

@@ -254,7 +254,7 @@ The token positions of all of the terms are then stored in a separate file with
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
we advance the position reader by the number of term frequencies of the current document.
## [fieldnorm/](src/fieldnorm): Here is my doc, how many tokens in this field?
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires to know the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.

View File

@@ -1,125 +1,3 @@
Tantivy 0.21.1
================================
#### Bugfixes
- Range queries on fast fields with less values on that field than documents had an invalid end condition, leading to missing results. [#2226](https://github.com/quickwit-oss/tantivy/issues/2226)(@appaquet @PSeitz)
- Increase the minimum memory budget from 3MB to 15MB to avoid single doc segments (API fix). [#2176](https://github.com/quickwit-oss/tantivy/issues/2176)(@PSeitz)
Tantivy 0.21
================================
#### Bugfixes
- Fix track fast field memory consumption, which led to higher memory consumption than the budget allowed during indexing [#2148](https://github.com/quickwit-oss/tantivy/issues/2148)[#2147](https://github.com/quickwit-oss/tantivy/issues/2147)(@PSeitz)
- Fix a regression from 0.20 where sort index by date wasn't working anymore [#2124](https://github.com/quickwit-oss/tantivy/issues/2124)(@PSeitz)
- Fix getting the root facet on the `FacetCollector`. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086)(@adamreichold)
- Align numerical type priority order of columnar and query. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088)(@fmassot)
#### Breaking Changes
- Remove support for Brotli and Snappy compression [#2123](https://github.com/quickwit-oss/tantivy/issues/2123)(@adamreichold)
#### Features/Improvements
- Implement lenient query parser [#2129](https://github.com/quickwit-oss/tantivy/pull/2129)(@trinity-1686a)
- order_by_u64_field and order_by_fast_field allow sorting in ascending and descending order [#2111](https://github.com/quickwit-oss/tantivy/issues/2111)(@naveenann)
- Allow dynamic filters in text analyzer builder [#2110](https://github.com/quickwit-oss/tantivy/issues/2110)(@fulmicoton @fmassot)
- **Aggregation**
- Add missing parameter for term aggregation [#2149](https://github.com/quickwit-oss/tantivy/issues/2149)[#2103](https://github.com/quickwit-oss/tantivy/issues/2103)(@PSeitz)
- Add missing parameter for percentiles [#2157](https://github.com/quickwit-oss/tantivy/issues/2157)(@PSeitz)
- Add missing parameter for stats,min,max,count,sum,avg [#2151](https://github.com/quickwit-oss/tantivy/issues/2151)(@PSeitz)
- Improve aggregation deserialization error message [#2150](https://github.com/quickwit-oss/tantivy/issues/2150)(@PSeitz)
- Add validation for type Bytes to term_agg [#2077](https://github.com/quickwit-oss/tantivy/issues/2077)(@PSeitz)
- Alternative mixed field collection [#2135](https://github.com/quickwit-oss/tantivy/issues/2135)(@PSeitz)
- Add missing query_terms impl for TermSetQuery. [#2120](https://github.com/quickwit-oss/tantivy/issues/2120)(@adamreichold)
- Minor improvements to OwnedBytes [#2134](https://github.com/quickwit-oss/tantivy/issues/2134)(@adamreichold)
- Remove allocations in split compound words [#2080](https://github.com/quickwit-oss/tantivy/issues/2080)(@PSeitz)
- Ngram tokenizer now returns an error with invalid arguments [#2102](https://github.com/quickwit-oss/tantivy/issues/2102)(@fmassot)
- Make TextAnalyzerBuilder public [#2097](https://github.com/quickwit-oss/tantivy/issues/2097)(@adamreichold)
- Return an error when tokenizer is not found while indexing [#2093](https://github.com/quickwit-oss/tantivy/issues/2093)(@naveenann)
- Delayed column opening during merge [#2132](https://github.com/quickwit-oss/tantivy/issues/2132)(@PSeitz)
Tantivy 0.20.2
================================
- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)
Tantivy 0.20.1
================================
- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
Tantivy 0.20
================================
#### Bugfixes
- Fix phrase queries with slop (slop supports now transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
- Aggregation
- Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
- Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)
#### Features/Improvements
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
- Unified access for fast fields over different cardinalities.
- Unified storage for typed and untyped fields.
- Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
- Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
- Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
- **Aggregation**
- Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
- Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
- Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
- Reduce agg memory consumption: Replace generic aggregation collector (which has a high memory requirement per instance) in aggregation tree with optimized versions behind a trait.
- Split term collection count and sub_agg (Faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
- Schemaless aggregations: In combination with stacker tantivy supports now schemaless aggregations via the JSON type.
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
- Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
- Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
- PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
- Docs
- Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
- Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
- Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
- Fix BooleanQuery document [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
- Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)
Tantivy 0.19
================================
#### Bugfixes

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.0"
version = "0.19.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,7 +12,6 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.5"
@@ -21,49 +20,48 @@ byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
aho-corasick = "0.7"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs4 = { version = "0.7.0", optional = true }
fs2 = { version = "0.4.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { git = "https://github.com/quickwit-oss/bitpacking", rev = "f730b75", default-features = false, features = ["bitpacker4x"] }
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
fail = "0.5.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
lru = "0.9.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
itertools = "0.10.3"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.2", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.2", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.21.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.5", path="./bitpacker" }
common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -74,16 +72,11 @@ maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
criterion = "0.4"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail]
version = "0.5.0"
@@ -94,32 +87,24 @@ opt-level = 3
debug = false
debug-assertions = false
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false
[profile.test]
debug-assertions = true
overflow-checks = true
[features]
default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
mmap = ["fs2", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
quickwit = ["sstable"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
@@ -134,7 +119,7 @@ members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sst
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["failpoints"]
required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
@@ -143,3 +128,4 @@ harness = false
[[bench]]
name = "index-bench"
harness = false

View File

@@ -1,5 +1,5 @@
test:
@echo "Run test only... No examples."
echo "Run test only... No examples."
cargo test --tests --lib
fmt:

View File

@@ -26,8 +26,6 @@ Your mileage WILL vary depending on the nature of queries and their load.
<img src="doc/assets/images/searchbenchmark.png">
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
# Features
- Full-text search
@@ -44,7 +42,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None)
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)

View File

@@ -1,21 +0,0 @@
# Release a new Tantivy Version
## Steps
1. Identify new packages in workspace since last release
2. Identify changed packages in workspace since last release
3. Bump version in `Cargo.toml` and their dependents for all changed packages
4. Update version of root `Cargo.toml`
5. Publish version starting with leaf nodes
6. Set git tag with new version
In conjucation with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
Set new packages to version 0.0.0
Replace prev-tag-name
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```
no-tag or it will create tags for all the subpackages

23
appveyor.yml Normal file
View File

@@ -0,0 +1,23 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust
os: Visual Studio 2015
environment:
matrix:
- channel: stable
target: x86_64-pc-windows-msvc
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
- rustc -vV
- cargo -vV
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -1,13 +1,11 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::{
LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let mut tokenizer = tokenizer_manager.get("default").unwrap();
let tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
@@ -18,26 +16,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.dynamic()
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser)
.build();
c.bench_function("dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

File diff suppressed because one or more lines are too long

View File

@@ -1,15 +1,10 @@
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json");
fn get_lines(input: &str) -> Vec<&str> {
input.trim().split('\n').collect()
}
const NUM_REPEATS: usize = 2;
pub fn hdfs_index_benchmark(c: &mut Criterion) {
let schema = {
@@ -33,152 +28,85 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
};
let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
}
pub fn gh_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| {
let lines = get_lines(GH_LOGS);
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-gh-with-commit", |b| {
let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
pub fn wiki_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-wiki");
group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-wiki-with-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
@@ -187,17 +115,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
criterion_group! {
name = benches;
config = Criterion::default();
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = hdfs_index_benchmark
}
criterion_group! {
name = gh_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark
}
criterion_group! {
name = wiki_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark
}
criterion_main!(benches, gh_benches, wiki_benches);
criterion_main!(benches);

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.5.0"
version = "0.3.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
@@ -15,7 +15,6 @@ homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}
[dev-dependencies]
rand = "0.8"

View File

@@ -1,14 +1,10 @@
use std::convert::TryInto;
use std::io;
use std::ops::{Range, RangeInclusive};
use bitpacking::{BitPacker as ExternalBitPackerTrait, BitPacker1x};
pub struct BitPacker {
mini_buffer: u64,
mini_buffer_written: usize,
}
impl Default for BitPacker {
fn default() -> Self {
BitPacker::new()
@@ -122,125 +118,6 @@ impl BitUnpacker {
let val_shifted = val_unshifted_unmasked >> bit_shift;
val_shifted & self.mask
}
// Decodes the range of bitpacked `u32` values with idx
// in [start_idx, start_idx + output.len()).
//
// #Panics
//
// This methods panics if `num_bits` is > 32.
fn get_batch_u32s(&self, start_idx: u32, data: &[u8], output: &mut [u32]) {
assert!(
self.bit_width() <= 32,
"Bitwidth must be <= 32 to use this method."
);
let end_idx = start_idx + output.len() as u32;
let end_bit_read = end_idx * self.num_bits;
let end_byte_read = (end_bit_read + 7) / 8;
assert!(
end_byte_read as usize <= data.len(),
"Requested index is out of bounds."
);
// Simple slow implementation of get_batch_u32s, to deal with our ramps.
let get_batch_ramp = |start_idx: u32, output: &mut [u32]| {
for (out, idx) in output.iter_mut().zip(start_idx..) {
*out = self.get(idx, data) as u32;
}
};
// We use an unrolled routine to decode 32 values at once.
// We therefore decompose our range of values to decode into three ranges:
// - Entrance ramp: [start_idx, fast_track_start) (up to 31 values)
// - Highway: [fast_track_start, fast_track_end) (a length multiple of 32s)
// - Exit ramp: [fast_track_end, start_idx + output.len()) (up to 31 values)
// We want the start of the fast track to start align with bytes.
// A sufficient condition is to start with an idx that is a multiple of 8,
// so highway start is the closest multiple of 8 that is >= start_idx.
let entrance_ramp_len = 8 - (start_idx % 8) % 8;
let highway_start: u32 = start_idx + entrance_ramp_len;
if highway_start + BitPacker1x::BLOCK_LEN as u32 > end_idx {
// We don't have enough values to have even a single block of highway.
// Let's just supply the values the simple way.
get_batch_ramp(start_idx, output);
return;
}
let num_blocks: u32 = (end_idx - highway_start) / BitPacker1x::BLOCK_LEN as u32;
// Entrance ramp
get_batch_ramp(start_idx, &mut output[..entrance_ramp_len as usize]);
// Highway
let mut offset = (highway_start * self.num_bits) as usize / 8;
let mut output_cursor = (highway_start - start_idx) as usize;
for _ in 0..num_blocks {
offset += BitPacker1x.decompress(
&data[offset..],
&mut output[output_cursor..],
self.num_bits as u8,
);
output_cursor += 32;
}
// Exit ramp
let highway_end = highway_start + num_blocks * BitPacker1x::BLOCK_LEN as u32;
get_batch_ramp(highway_end, &mut output[output_cursor..]);
}
pub fn get_ids_for_value_range(
&self,
range: RangeInclusive<u64>,
id_range: Range<u32>,
data: &[u8],
positions: &mut Vec<u32>,
) {
if self.bit_width() > 32 {
self.get_ids_for_value_range_slow(range, id_range, data, positions)
} else {
if *range.start() > u32::MAX as u64 {
positions.clear();
return;
}
let range_u32 = (*range.start() as u32)..=(*range.end()).min(u32::MAX as u64) as u32;
self.get_ids_for_value_range_fast(range_u32, id_range, data, positions)
}
}
fn get_ids_for_value_range_slow(
&self,
range: RangeInclusive<u64>,
id_range: Range<u32>,
data: &[u8],
positions: &mut Vec<u32>,
) {
positions.clear();
for i in id_range {
// If we cared we could make this branchless, but the slow implementation should rarely
// kick in.
let val = self.get(i, data);
if range.contains(&val) {
positions.push(i);
}
}
}
fn get_ids_for_value_range_fast(
&self,
value_range: RangeInclusive<u32>,
id_range: Range<u32>,
data: &[u8],
positions: &mut Vec<u32>,
) {
positions.resize(id_range.len(), 0u32);
self.get_batch_u32s(id_range.start, data, positions);
crate::filter_vec::filter_vec_in_place(value_range, id_range.start, positions)
}
}
#[cfg(test)]
@@ -323,58 +200,4 @@ mod test {
test_bitpacker_aux(num_bits, &vals);
}
}
#[test]
#[should_panic]
fn test_get_batch_panics_over_32_bits() {
let bitunpacker = BitUnpacker::new(33);
let mut output: [u32; 1] = [0u32];
bitunpacker.get_batch_u32s(0, &[0, 0, 0, 0, 0, 0, 0, 0], &mut output[..]);
}
#[test]
fn test_get_batch_limit() {
let bitunpacker = BitUnpacker::new(1);
let mut output: [u32; 3] = [0u32, 0u32, 0u32];
bitunpacker.get_batch_u32s(8 * 4 - 3, &[0u8, 0u8, 0u8, 0u8], &mut output[..]);
}
#[test]
#[should_panic]
fn test_get_batch_panics_when_off_scope() {
let bitunpacker = BitUnpacker::new(1);
let mut output: [u32; 3] = [0u32, 0u32, 0u32];
// We are missing exactly one bit.
bitunpacker.get_batch_u32s(8 * 4 - 2, &[0u8, 0u8, 0u8, 0u8], &mut output[..]);
}
proptest::proptest! {
#[test]
fn test_get_batch_u32s_proptest(num_bits in 0u8..=32u8) {
let mask =
if num_bits == 32u8 {
u32::MAX
} else {
(1u32 << num_bits) - 1
};
let mut buffer: Vec<u8> = Vec::new();
let mut bitpacker = BitPacker::new();
for val in 0..100 {
bitpacker.write(val & mask as u64, num_bits, &mut buffer).unwrap();
}
bitpacker.flush(&mut buffer).unwrap();
let bitunpacker = BitUnpacker::new(num_bits);
let mut output: Vec<u32> = Vec::new();
for len in [0, 1, 2, 32, 33, 34, 64] {
for start_idx in 0u32..32u32 {
output.resize(len, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len {
let expected = (start_idx + i as u32) & mask;
assert_eq!(output[i], expected);
}
}
}
}
}
}

View File

@@ -64,8 +64,10 @@ fn mem_usage<T>(items: &Vec<T>) -> usize {
impl BlockedBitpacker {
pub fn new() -> Self {
let mut compressed_blocks = vec![];
compressed_blocks.resize(8, 0);
Self {
compressed_blocks: vec![0; 8],
compressed_blocks,
buffer: vec![],
offset_and_bits: vec![],
}

View File

@@ -1,365 +0,0 @@
//! SIMD filtering of a vector as described in the following blog post.
//! <https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512>
use std::arch::x86_64::{
__m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater,
_mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1,
_mm256_storeu_si256 as store_unaligned, _mm256_xor_si256 as op_xor, *,
};
use std::ops::RangeInclusive;
const NUM_LANES: usize = 8;
const HIGHEST_BIT: u32 = 1 << 31;
#[inline]
fn u32_to_i32(val: u32) -> i32 {
(val ^ HIGHEST_BIT) as i32
}
#[inline]
unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
op_xor(vals_u32x8s, HIGHEST_BIT_MASK)
}
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
// We use a monotonic mapping from u32 to i32 to make the comparison possible in AVX2.
let range_i32: RangeInclusive<i32> = u32_to_i32(*range.start())..=u32_to_i32(*range.end());
let num_words = output.len() / NUM_LANES;
let mut output_len = unsafe {
filter_vec_avx2_aux(
output.as_ptr() as *const __m256i,
range_i32,
output.as_mut_ptr(),
offset,
num_words,
)
};
let reminder_start = num_words * NUM_LANES;
for i in reminder_start..output.len() {
let val = output[i];
output[output_len] = offset + i as u32;
output_len += if range.contains(&val) { 1 } else { 0 };
}
output.truncate(output_len);
}
#[target_feature(enable = "avx2")]
unsafe fn filter_vec_avx2_aux(
mut input: *const __m256i,
range: RangeInclusive<i32>,
output: *mut u32,
offset: u32,
num_words: usize,
) -> usize {
let mut output_tail = output;
let range_simd = set1(*range.start())..=set1(*range.end());
let mut ids = from_u32x8([
offset,
offset + 1,
offset + 2,
offset + 3,
offset + 4,
offset + 5,
offset + 6,
offset + 7,
]);
const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
for _ in 0..num_words {
let word = load_unaligned(input);
let word = u32_to_i32_avx2(word);
let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
let added_len = keeper_bitset.count_ones();
let filtered_doc_ids = compact(ids, keeper_bitset);
store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
output_tail = output_tail.offset(added_len as isize);
ids = op_add(ids, SHIFT);
input = input.offset(1);
}
output_tail.offset_from(output) as usize
}
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn compact(data: DataType, mask: u8) -> DataType {
let vperm_mask = MASK_TO_PERMUTATION[mask as usize];
_mm256_permutevar8x32_epi32(data, vperm_mask)
}
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn compute_filter_bitset(val: __m256i, range: std::ops::RangeInclusive<__m256i>) -> u8 {
let too_low = op_greater(*range.start(), val);
let too_high = op_greater(val, *range.end());
let inside = op_or(too_low, too_high);
255 - std::arch::x86_64::_mm256_movemask_ps(std::mem::transmute::<DataType, __m256>(inside))
as u8
}
union U8x32 {
vector: DataType,
vals: [u32; NUM_LANES],
}
const fn from_u32x8(vals: [u32; NUM_LANES]) -> DataType {
unsafe { U8x32 { vals }.vector }
}
const MASK_TO_PERMUTATION: [DataType; 256] = [
from_u32x8([0, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 0, 0, 0, 0, 0, 0]),
from_u32x8([2, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 0, 0, 0, 0, 0]),
from_u32x8([3, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 3, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 0, 0, 0, 0, 0]),
from_u32x8([2, 3, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 3, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 0, 0, 0, 0]),
from_u32x8([4, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 4, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 0, 0, 0, 0, 0]),
from_u32x8([2, 4, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 4, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 0, 0, 0, 0]),
from_u32x8([3, 4, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 0, 0, 0, 0, 0]),
from_u32x8([1, 3, 4, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 0, 0, 0, 0]),
from_u32x8([2, 3, 4, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 0, 0, 0, 0]),
from_u32x8([1, 2, 3, 4, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 0, 0, 0]),
from_u32x8([5, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 5, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 5, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 5, 0, 0, 0, 0, 0]),
from_u32x8([2, 5, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 5, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 5, 0, 0, 0, 0]),
from_u32x8([3, 5, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 5, 0, 0, 0, 0, 0]),
from_u32x8([1, 3, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 5, 0, 0, 0, 0]),
from_u32x8([2, 3, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 5, 0, 0, 0, 0]),
from_u32x8([1, 2, 3, 5, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 5, 0, 0, 0]),
from_u32x8([4, 5, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 5, 0, 0, 0, 0, 0]),
from_u32x8([1, 4, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 5, 0, 0, 0, 0]),
from_u32x8([2, 4, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 5, 0, 0, 0, 0]),
from_u32x8([1, 2, 4, 5, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 5, 0, 0, 0]),
from_u32x8([3, 4, 5, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 5, 0, 0, 0, 0]),
from_u32x8([1, 3, 4, 5, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 5, 0, 0, 0]),
from_u32x8([2, 3, 4, 5, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 5, 0, 0, 0]),
from_u32x8([1, 2, 3, 4, 5, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 5, 0, 0]),
from_u32x8([6, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 6, 0, 0, 0, 0, 0]),
from_u32x8([2, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 6, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 6, 0, 0, 0, 0]),
from_u32x8([3, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 6, 0, 0, 0, 0, 0]),
from_u32x8([1, 3, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 6, 0, 0, 0, 0]),
from_u32x8([2, 3, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 6, 0, 0, 0, 0]),
from_u32x8([1, 2, 3, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 6, 0, 0, 0]),
from_u32x8([4, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 6, 0, 0, 0, 0, 0]),
from_u32x8([1, 4, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 6, 0, 0, 0, 0]),
from_u32x8([2, 4, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 6, 0, 0, 0, 0]),
from_u32x8([1, 2, 4, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 6, 0, 0, 0]),
from_u32x8([3, 4, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 6, 0, 0, 0, 0]),
from_u32x8([1, 3, 4, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 6, 0, 0, 0]),
from_u32x8([2, 3, 4, 6, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 6, 0, 0, 0]),
from_u32x8([1, 2, 3, 4, 6, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 6, 0, 0]),
from_u32x8([5, 6, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 5, 6, 0, 0, 0, 0, 0]),
from_u32x8([1, 5, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 5, 6, 0, 0, 0, 0]),
from_u32x8([2, 5, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 5, 6, 0, 0, 0, 0]),
from_u32x8([1, 2, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 5, 6, 0, 0, 0]),
from_u32x8([3, 5, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 5, 6, 0, 0, 0, 0]),
from_u32x8([1, 3, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 5, 6, 0, 0, 0]),
from_u32x8([2, 3, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 5, 6, 0, 0, 0]),
from_u32x8([1, 2, 3, 5, 6, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 5, 6, 0, 0]),
from_u32x8([4, 5, 6, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 5, 6, 0, 0, 0, 0]),
from_u32x8([1, 4, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 5, 6, 0, 0, 0]),
from_u32x8([2, 4, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 5, 6, 0, 0, 0]),
from_u32x8([1, 2, 4, 5, 6, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 5, 6, 0, 0]),
from_u32x8([3, 4, 5, 6, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 5, 6, 0, 0, 0]),
from_u32x8([1, 3, 4, 5, 6, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 5, 6, 0, 0]),
from_u32x8([2, 3, 4, 5, 6, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 5, 6, 0, 0]),
from_u32x8([1, 2, 3, 4, 5, 6, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 5, 6, 0]),
from_u32x8([7, 0, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([1, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 7, 0, 0, 0, 0, 0]),
from_u32x8([2, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 7, 0, 0, 0, 0, 0]),
from_u32x8([1, 2, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 7, 0, 0, 0, 0]),
from_u32x8([3, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 7, 0, 0, 0, 0, 0]),
from_u32x8([1, 3, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 7, 0, 0, 0, 0]),
from_u32x8([2, 3, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 7, 0, 0, 0, 0]),
from_u32x8([1, 2, 3, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 7, 0, 0, 0]),
from_u32x8([4, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 7, 0, 0, 0, 0, 0]),
from_u32x8([1, 4, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 7, 0, 0, 0, 0]),
from_u32x8([2, 4, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 7, 0, 0, 0, 0]),
from_u32x8([1, 2, 4, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 7, 0, 0, 0]),
from_u32x8([3, 4, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 7, 0, 0, 0, 0]),
from_u32x8([1, 3, 4, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 7, 0, 0, 0]),
from_u32x8([2, 3, 4, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 7, 0, 0, 0]),
from_u32x8([1, 2, 3, 4, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 7, 0, 0]),
from_u32x8([5, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 5, 7, 0, 0, 0, 0, 0]),
from_u32x8([1, 5, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 5, 7, 0, 0, 0, 0]),
from_u32x8([2, 5, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 5, 7, 0, 0, 0, 0]),
from_u32x8([1, 2, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 5, 7, 0, 0, 0]),
from_u32x8([3, 5, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 5, 7, 0, 0, 0, 0]),
from_u32x8([1, 3, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 5, 7, 0, 0, 0]),
from_u32x8([2, 3, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 5, 7, 0, 0, 0]),
from_u32x8([1, 2, 3, 5, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 5, 7, 0, 0]),
from_u32x8([4, 5, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 5, 7, 0, 0, 0, 0]),
from_u32x8([1, 4, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 5, 7, 0, 0, 0]),
from_u32x8([2, 4, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 5, 7, 0, 0, 0]),
from_u32x8([1, 2, 4, 5, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 5, 7, 0, 0]),
from_u32x8([3, 4, 5, 7, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 5, 7, 0, 0, 0]),
from_u32x8([1, 3, 4, 5, 7, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 5, 7, 0, 0]),
from_u32x8([2, 3, 4, 5, 7, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 5, 7, 0, 0]),
from_u32x8([1, 2, 3, 4, 5, 7, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 5, 7, 0]),
from_u32x8([6, 7, 0, 0, 0, 0, 0, 0]),
from_u32x8([0, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([1, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 1, 6, 7, 0, 0, 0, 0]),
from_u32x8([2, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 2, 6, 7, 0, 0, 0, 0]),
from_u32x8([1, 2, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 2, 6, 7, 0, 0, 0]),
from_u32x8([3, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 3, 6, 7, 0, 0, 0, 0]),
from_u32x8([1, 3, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 3, 6, 7, 0, 0, 0]),
from_u32x8([2, 3, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 3, 6, 7, 0, 0, 0]),
from_u32x8([1, 2, 3, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 3, 6, 7, 0, 0]),
from_u32x8([4, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 4, 6, 7, 0, 0, 0, 0]),
from_u32x8([1, 4, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 4, 6, 7, 0, 0, 0]),
from_u32x8([2, 4, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 4, 6, 7, 0, 0, 0]),
from_u32x8([1, 2, 4, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 4, 6, 7, 0, 0]),
from_u32x8([3, 4, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 3, 4, 6, 7, 0, 0, 0]),
from_u32x8([1, 3, 4, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 3, 4, 6, 7, 0, 0]),
from_u32x8([2, 3, 4, 6, 7, 0, 0, 0]),
from_u32x8([0, 2, 3, 4, 6, 7, 0, 0]),
from_u32x8([1, 2, 3, 4, 6, 7, 0, 0]),
from_u32x8([0, 1, 2, 3, 4, 6, 7, 0]),
from_u32x8([5, 6, 7, 0, 0, 0, 0, 0]),
from_u32x8([0, 5, 6, 7, 0, 0, 0, 0]),
from_u32x8([1, 5, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 1, 5, 6, 7, 0, 0, 0]),
from_u32x8([2, 5, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 2, 5, 6, 7, 0, 0, 0]),
from_u32x8([1, 2, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 2, 5, 6, 7, 0, 0]),
from_u32x8([3, 5, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 3, 5, 6, 7, 0, 0, 0]),
from_u32x8([1, 3, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 3, 5, 6, 7, 0, 0]),
from_u32x8([2, 3, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 2, 3, 5, 6, 7, 0, 0]),
from_u32x8([1, 2, 3, 5, 6, 7, 0, 0]),
from_u32x8([0, 1, 2, 3, 5, 6, 7, 0]),
from_u32x8([4, 5, 6, 7, 0, 0, 0, 0]),
from_u32x8([0, 4, 5, 6, 7, 0, 0, 0]),
from_u32x8([1, 4, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 1, 4, 5, 6, 7, 0, 0]),
from_u32x8([2, 4, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 2, 4, 5, 6, 7, 0, 0]),
from_u32x8([1, 2, 4, 5, 6, 7, 0, 0]),
from_u32x8([0, 1, 2, 4, 5, 6, 7, 0]),
from_u32x8([3, 4, 5, 6, 7, 0, 0, 0]),
from_u32x8([0, 3, 4, 5, 6, 7, 0, 0]),
from_u32x8([1, 3, 4, 5, 6, 7, 0, 0]),
from_u32x8([0, 1, 3, 4, 5, 6, 7, 0]),
from_u32x8([2, 3, 4, 5, 6, 7, 0, 0]),
from_u32x8([0, 2, 3, 4, 5, 6, 7, 0]),
from_u32x8([1, 2, 3, 4, 5, 6, 7, 0]),
from_u32x8([0, 1, 2, 3, 4, 5, 6, 7]),
];

View File

@@ -1,165 +0,0 @@
use std::ops::RangeInclusive;
#[cfg(target_arch = "x86_64")]
mod avx2;
mod scalar;
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
#[repr(u8)]
enum FilterImplPerInstructionSet {
#[cfg(target_arch = "x86_64")]
AVX2 = 0u8,
Scalar = 1u8,
}
impl FilterImplPerInstructionSet {
#[inline]
pub fn is_available(&self) -> bool {
match *self {
#[cfg(target_arch = "x86_64")]
FilterImplPerInstructionSet::AVX2 => is_x86_feature_detected!("avx2"),
FilterImplPerInstructionSet::Scalar => true,
}
}
}
// List of available implementation in preferred order.
#[cfg(target_arch = "x86_64")]
const IMPLS: [FilterImplPerInstructionSet; 2] = [
FilterImplPerInstructionSet::AVX2,
FilterImplPerInstructionSet::Scalar,
];
#[cfg(not(target_arch = "x86_64"))]
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
impl FilterImplPerInstructionSet {
#[allow(unused_variables)]
#[inline]
fn from(code: u8) -> FilterImplPerInstructionSet {
#[cfg(target_arch = "x86_64")]
if code == FilterImplPerInstructionSet::AVX2 as u8 {
return FilterImplPerInstructionSet::AVX2;
}
FilterImplPerInstructionSet::Scalar
}
#[inline]
fn filter_vec_in_place(self, range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
match self {
#[cfg(target_arch = "x86_64")]
FilterImplPerInstructionSet::AVX2 => avx2::filter_vec_in_place(range, offset, output),
FilterImplPerInstructionSet::Scalar => {
scalar::filter_vec_in_place(range, offset, output)
}
}
}
}
#[inline]
fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
use std::sync::atomic::{AtomicU8, Ordering};
static INSTRUCTION_SET_BYTE: AtomicU8 = AtomicU8::new(u8::MAX);
let instruction_set_byte: u8 = INSTRUCTION_SET_BYTE.load(Ordering::Relaxed);
if instruction_set_byte == u8::MAX {
// Let's initialize the instruction set and cache it.
let instruction_set = IMPLS
.into_iter()
.find(FilterImplPerInstructionSet::is_available)
.unwrap();
INSTRUCTION_SET_BYTE.store(instruction_set as u8, Ordering::Relaxed);
return instruction_set;
}
FilterImplPerInstructionSet::from(instruction_set_byte)
}
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
get_best_available_instruction_set().filter_vec_in_place(range, offset, output)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_get_best_available_instruction_set() {
// This does not test much unfortunately.
// We just make sure the function returns without crashing and returns the same result.
let instruction_set = get_best_available_instruction_set();
assert_eq!(get_best_available_instruction_set(), instruction_set);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_instruction_set_to_code_from_code() {
for instruction_set in [
FilterImplPerInstructionSet::AVX2,
FilterImplPerInstructionSet::Scalar,
] {
let code = instruction_set as u8;
assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
}
}
fn test_filter_impl_empty_aux(filter_impl: FilterImplPerInstructionSet) {
let mut output = vec![];
filter_impl.filter_vec_in_place(0..=u32::MAX, 0, &mut output);
assert_eq!(&output, &[]);
}
fn test_filter_impl_simple_aux(filter_impl: FilterImplPerInstructionSet) {
let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
filter_impl.filter_vec_in_place(3..=10, 0, &mut output);
assert_eq!(&output, &[0, 3, 6, 7]);
}
fn test_filter_impl_simple_aux_shifted(filter_impl: FilterImplPerInstructionSet) {
let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
filter_impl.filter_vec_in_place(3..=10, 10, &mut output);
assert_eq!(&output, &[10, 13, 16, 17]);
}
fn test_filter_impl_simple_outside_i32_range(filter_impl: FilterImplPerInstructionSet) {
let mut output = vec![u32::MAX, i32::MAX as u32 + 1, 0, 1, 3, 1, 1, 1, 1];
filter_impl.filter_vec_in_place(1..=i32::MAX as u32 + 1u32, 0, &mut output);
assert_eq!(&output, &[1, 3, 4, 5, 6, 7, 8]);
}
fn test_filter_impl_test_suite(filter_impl: FilterImplPerInstructionSet) {
test_filter_impl_empty_aux(filter_impl);
test_filter_impl_simple_aux(filter_impl);
test_filter_impl_simple_aux_shifted(filter_impl);
test_filter_impl_simple_outside_i32_range(filter_impl);
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_filter_implementation_avx2() {
if FilterImplPerInstructionSet::AVX2.is_available() {
test_filter_impl_test_suite(FilterImplPerInstructionSet::AVX2);
}
}
#[test]
fn test_filter_implementation_scalar() {
test_filter_impl_test_suite(FilterImplPerInstructionSet::Scalar);
}
#[cfg(target_arch = "x86_64")]
proptest::proptest! {
#[test]
fn test_filter_compare_scalar_and_avx2_impl_proptest(
start in proptest::prelude::any::<u32>(),
end in proptest::prelude::any::<u32>(),
offset in 0u32..2u32,
mut vals in proptest::collection::vec(0..u32::MAX, 0..30)) {
if FilterImplPerInstructionSet::AVX2.is_available() {
let mut vals_clone = vals.clone();
FilterImplPerInstructionSet::AVX2.filter_vec_in_place(start..=end, offset, &mut vals);
FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut vals_clone);
assert_eq!(&vals, &vals_clone);
}
}
}
}

View File

@@ -1,13 +0,0 @@
use std::ops::RangeInclusive;
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
// We restrict the accepted boundary, because unsigned integers & SIMD don't
// play well.
let mut output_cursor = 0;
for i in 0..output.len() {
let val = output[i];
output[output_cursor] = offset + i as u32;
output_cursor += if range.contains(&val) { 1 } else { 0 };
}
output.truncate(output_cursor);
}

View File

@@ -1,6 +1,5 @@
mod bitpacker;
mod blocked_bitpacker;
mod filter_vec;
use std::cmp::Ordering;

View File

@@ -1,90 +0,0 @@
# configuration file for git-cliff{ pattern = "foo", replace = "bar"}
# see https://github.com/orhun/git-cliff#configuration-file
[changelog]
# changelog header
header = """
"""
# template for the changelog body
# https://tera.netlify.app/docs/#introduction
body = """
{% if version %}\
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
==================
{% else %}\
## [unreleased]
{% endif %}\
{% for commit in commits %}
- {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
{% endfor %}
"""
# remove the leading and trailing whitespace from the template
trim = true
# changelog footer
footer = """
"""
postprocessors = [
{ pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
{ pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
{ pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
{ pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
{ pattern = 'Naveen Aiathurai', replace = "naveenann"}, # replace with github user
{ pattern = '', replace = ""}, # replace with github user
]
[git]
# parse the commits based on https://www.conventionalcommits.org
# This is required or commit.message contains the whole commit message and not just the title
conventional_commits = true
# filter out the commits that are not conventional
filter_unconventional = false
# process each line of a commit as an individual commit
split_commits = false
# regex for preprocessing the commit messages
commit_preprocessors = [
{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
]
#link_parsers = [
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
#]
# regex for parsing and grouping commits
commit_parsers = [
{ message = "^feat", group = "Features"},
{ message = "^fix", group = "Bug Fixes"},
{ message = "^doc", group = "Documentation"},
{ message = "^perf", group = "Performance"},
{ message = "^refactor", group = "Refactor"},
{ message = "^style", group = "Styling"},
{ message = "^test", group = "Testing"},
{ message = "^chore\\(release\\): prepare for", skip = true},
{ message = "(?i)clippy", skip = true},
{ message = "(?i)dependabot", skip = true},
{ message = "(?i)fmt", skip = true},
{ message = "(?i)bump", skip = true},
{ message = "(?i)readme", skip = true},
{ message = "(?i)comment", skip = true},
{ message = "(?i)spelling", skip = true},
{ message = "^chore", group = "Miscellaneous Tasks"},
{ body = ".*security", group = "Security"},
{ message = ".*", group = "Other", default_scope = "other"},
]
# protect breaking changes from being skipped due to matching a skipping commit_parser
protect_breaking_commits = false
# filter out the commits that are not matched by commit parsers
filter_commits = false
# glob pattern for matching git tags
tag_pattern = "v[0-9]*"
# regex for skipping tags
skip_tags = "v0.1.0-beta.1"
# regex for ignoring tags
ignore_tags = ""
# sort the tags topologically
topo_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "newest"
# limit the number of commits included in the changelog.
# limit_commits = 42

View File

@@ -1,28 +1,27 @@
[package]
name = "tantivy-columnar"
version = "0.2.0"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.11.0"
itertools = "0.10.5"
log = "0.4.17"
fnv = "1.0.7"
fastdivide = "0.4.0"
rand = { version = "0.8.5", optional = true }
measure_time = { version = "0.8.2", optional = true }
prettytable-rs = { version = "0.10.0", optional = true }
stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.6", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
serde = "1.0.152"
stacker = { path = "../stacker", package="tantivy-stacker"}
sstable = { path = "../sstable", package = "tantivy-sstable" }
common = { path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
rand = "0.8.5"
[features]
unstable = []

View File

@@ -1,6 +1,7 @@
# zero to one
* revisit line codec
* removal of all rows of a column in the schema due to deletes
* add columns from schema on merge
* Plugging JSON
* replug examples

View File

@@ -1,132 +0,0 @@
use std::cmp::Ordering;
use crate::{Column, DocId, RowId};
#[derive(Debug, Default, Clone)]
pub struct ColumnBlockAccessor<T> {
val_cache: Vec<T>,
docid_cache: Vec<DocId>,
missing_docids_cache: Vec<DocId>,
row_id_cache: Vec<RowId>,
}
impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
ColumnBlockAccessor<T>
{
#[inline]
pub fn fetch_block(&mut self, docs: &[u32], accessor: &Column<T>) {
self.docid_cache.clear();
self.row_id_cache.clear();
accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache);
self.val_cache.resize(self.row_id_cache.len(), T::default());
accessor
.values
.get_vals(&self.row_id_cache, &mut self.val_cache);
}
#[inline]
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
self.fetch_block(docs, accessor);
// We can compare docid_cache with docs to find missing docs
if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
self.missing_docids_cache.clear();
find_missing_docs(docs, &self.docid_cache, |doc| {
self.missing_docids_cache.push(doc);
self.val_cache.push(missing);
});
self.docid_cache
.extend_from_slice(&self.missing_docids_cache);
}
}
#[inline]
pub fn iter_vals(&self) -> impl Iterator<Item = T> + '_ {
self.val_cache.iter().cloned()
}
#[inline]
pub fn iter_docid_vals(&self) -> impl Iterator<Item = (DocId, T)> + '_ {
self.docid_cache
.iter()
.cloned()
.zip(self.val_cache.iter().cloned())
}
}
/// Given two sorted lists of docids `docs` and `hits`, hits is a subset of `docs`.
/// Return all docs that are not in `hits`.
fn find_missing_docs<F>(docs: &[u32], hits: &[u32], mut callback: F)
where F: FnMut(u32) {
let mut docs_iter = docs.iter();
let mut hits_iter = hits.iter();
let mut doc = docs_iter.next();
let mut hit = hits_iter.next();
while let (Some(&current_doc), Some(&current_hit)) = (doc, hit) {
match current_doc.cmp(&current_hit) {
Ordering::Less => {
callback(current_doc);
doc = docs_iter.next();
}
Ordering::Equal => {
doc = docs_iter.next();
hit = hits_iter.next();
}
Ordering::Greater => {
hit = hits_iter.next();
}
}
}
while let Some(&current_doc) = doc {
callback(current_doc);
doc = docs_iter.next();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_find_missing_docs() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 3, 5, 7, 9]);
}
#[test]
fn test_find_missing_docs_empty() {
let docs: Vec<u32> = Vec::new();
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![]);
}
#[test]
fn test_find_missing_docs_all_missing() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5];
let hits: Vec<u32> = Vec::new();
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 2, 3, 4, 5]);
}
}

View File

@@ -1,6 +1,6 @@
use std::io;
use std::ops::Deref;
use std::sync::Arc;
use std::{fmt, io};
use sstable::{Dictionary, VoidSSTable};
@@ -21,22 +21,7 @@ pub struct BytesColumn {
pub(crate) term_ord_column: Column<u64>,
}
impl fmt::Debug for BytesColumn {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BytesColumn")
.field("term_ord_column", &self.term_ord_column)
.finish()
}
}
impl BytesColumn {
pub fn empty(num_docs: u32) -> BytesColumn {
BytesColumn {
dictionary: Arc::new(Dictionary::empty()),
term_ord_column: Column::build_empty_column(num_docs),
}
}
/// Fills the given `output` buffer with the term associated to the ordinal `ord`.
///
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
@@ -51,7 +36,7 @@ impl BytesColumn {
}
pub fn term_ords(&self, row_id: RowId) -> impl Iterator<Item = u64> + '_ {
self.term_ord_column.values_for_doc(row_id)
self.term_ord_column.values(row_id)
}
/// Returns the column of ordinals
@@ -71,12 +56,6 @@ impl BytesColumn {
#[derive(Clone)]
pub struct StrColumn(BytesColumn);
impl fmt::Debug for StrColumn {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self.term_ord_column)
}
}
impl From<StrColumn> for BytesColumn {
fn from(str_column: StrColumn) -> BytesColumn {
str_column.0
@@ -84,7 +63,7 @@ impl From<StrColumn> for BytesColumn {
}
impl StrColumn {
pub fn wrap(bytes_column: BytesColumn) -> StrColumn {
pub(crate) fn wrap(bytes_column: BytesColumn) -> StrColumn {
StrColumn(bytes_column)
}

View File

@@ -1,7 +1,7 @@
mod dictionary_encoded;
mod serialize;
use std::fmt::{self, Debug};
use std::fmt::Debug;
use std::io::Write;
use std::ops::{Deref, Range, RangeInclusive};
use std::sync::Arc;
@@ -16,33 +16,14 @@ pub use serialize::{
use crate::column_index::ColumnIndex;
use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{monotonic_map_column, ColumnValues};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
use crate::{Cardinality, MonotonicallyMappableToU64, RowId};
#[derive(Clone)]
pub struct Column<T = u64> {
pub index: ColumnIndex,
pub idx: ColumnIndex,
pub values: Arc<dyn ColumnValues<T>>,
}
impl<T: Debug + PartialOrd + Send + Sync + Copy + 'static> Debug for Column<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let num_docs = self.num_docs();
let entries = (0..num_docs)
.map(|i| (i, self.values_for_doc(i).collect::<Vec<_>>()))
.filter(|(_, vals)| !vals.is_empty());
f.debug_map().entries(entries).finish()
}
}
impl<T: PartialOrd + Default> Column<T> {
pub fn build_empty_column(num_docs: u32) -> Column<T> {
Column {
index: ColumnIndex::Empty { num_docs },
values: Arc::new(EmptyColumnValues),
}
}
}
impl<T: MonotonicallyMappableToU64> Column<T> {
pub fn to_u64_monotonic(self) -> Column<u64> {
let values = Arc::new(monotonic_map_column(
@@ -50,22 +31,20 @@ impl<T: MonotonicallyMappableToU64> Column<T> {
StrictlyMonotonicMappingToInternal::<T>::new(),
));
Column {
index: self.index,
idx: self.idx,
values,
}
}
}
impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
#[inline]
pub fn get_cardinality(&self) -> Cardinality {
self.index.get_cardinality()
self.idx.get_cardinality()
}
pub fn num_docs(&self) -> RowId {
match &self.index {
ColumnIndex::Empty { num_docs } => *num_docs,
ColumnIndex::Full => self.values.num_vals(),
match &self.idx {
ColumnIndex::Full => self.values.num_vals() as u32,
ColumnIndex::Optional(optional_index) => optional_index.num_docs(),
ColumnIndex::Multivalued(col_index) => {
// The multivalued index contains all value start row_id,
@@ -84,28 +63,11 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(row_id).next()
self.values(row_id).next()
}
/// Translates a block of docis to row_ids.
///
/// returns the row_ids and the matching docids on the same index
/// e.g.
/// DocId In: [0, 5, 6]
/// DocId Out: [0, 0, 6, 6]
/// RowId Out: [0, 1, 2, 3]
#[inline]
pub fn row_ids_for_docs(
&self,
doc_ids: &[DocId],
doc_ids_out: &mut Vec<DocId>,
row_ids: &mut Vec<RowId>,
) {
self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
}
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
self.value_row_ids(doc_id)
pub fn values(&self, row_id: RowId) -> impl Iterator<Item = T> + '_ {
self.value_row_ids(row_id)
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
}
@@ -115,28 +77,26 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
&self,
value_range: RangeInclusive<T>,
selected_docid_range: Range<u32>,
doc_ids: &mut Vec<u32>,
docids: &mut Vec<u32>,
) {
// convert passed docid range to row id range
let rowid_range = self
.index
.docid_range_to_rowids(selected_docid_range.clone());
let rowid_range = self.idx.docid_range_to_rowids(selected_docid_range.clone());
// Load rows
self.values
.get_row_ids_for_value_range(value_range, rowid_range, doc_ids);
.get_row_ids_for_value_range(value_range, rowid_range, docids);
// Convert rows to docids
self.index
.select_batch_in_place(selected_docid_range.start, doc_ids);
self.idx
.select_batch_in_place(docids, selected_docid_range.start);
}
/// Fills the output vector with the (possibly multiple values that are associated_with
/// Fils the output vector with the (possibly multiple values that are associated_with
/// `row_id`.
///
/// This method clears the `output` vector.
pub fn fill_vals(&self, row_id: RowId, output: &mut Vec<T>) {
output.clear();
output.extend(self.values_for_doc(row_id));
output.extend(self.values(row_id));
}
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
@@ -151,7 +111,7 @@ impl<T> Deref for Column<T> {
type Target = ColumnIndex;
fn deref(&self) -> &Self::Target {
&self.index
&self.idx
}
}
@@ -189,8 +149,7 @@ impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
}
fn num_vals(&self) -> u32 {
match &self.column.index {
ColumnIndex::Empty { .. } => 0u32,
match &self.column.idx {
ColumnIndex::Full => self.column.values.num_vals(),
ColumnIndex::Optional(optional_idx) => optional_idx.num_docs(),
ColumnIndex::Multivalued(multivalue_idx) => multivalue_idx.num_docs(),

View File

@@ -7,10 +7,9 @@ use sstable::Dictionary;
use crate::column::{BytesColumn, Column};
use crate::column_index::{serialize_column_index, SerializableColumnIndex};
use crate::column_values::{
load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values,
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
use crate::column_values::serialize::serialize_column_values_u128;
use crate::column_values::u64_based::{serialize_u64_based_column_values, CodecType};
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use crate::iterable::Iterable;
use crate::StrColumn;
@@ -50,9 +49,10 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = load_u64_based_column_values(column_values_data)?;
let column_values =
crate::column_values::u64_based::load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
idx: column_index,
values: column_values,
})
}
@@ -71,7 +71,7 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
Ok(Column {
index: column_index,
idx: column_index,
values: column_values,
})
}

View File

@@ -1,82 +1,29 @@
mod shuffled;
mod stacked;
use common::ReadOnlyBitSet;
use shuffled::merge_column_index_shuffled;
use stacked::merge_column_index_stacked;
use crate::column_index::SerializableColumnIndex;
use crate::{Cardinality, ColumnIndex, MergeRowOrder};
fn detect_cardinality_single_column_index(
column_index: &ColumnIndex,
alive_bitset_opt: &Option<ReadOnlyBitSet>,
) -> Cardinality {
let Some(alive_bitset) = alive_bitset_opt else {
return column_index.get_cardinality();
};
let cardinality_before_deletes = column_index.get_cardinality();
if cardinality_before_deletes == Cardinality::Full {
// The columnar cardinality can only become more restrictive in the presence of deletes
// (where cardinality sorted from the more restrictive to the least restrictive are Full,
// Optional, Multivalued)
//
// If we are already "Full", we are guaranteed to stay "Full" after deletes.
return Cardinality::Full;
}
let mut cardinality_so_far = Cardinality::Full;
for doc_id in alive_bitset.iter() {
let num_values = column_index.value_row_ids(doc_id).len();
let row_cardinality = match num_values {
0 => Cardinality::Optional,
1 => Cardinality::Full,
_ => Cardinality::Multivalued,
};
cardinality_so_far = cardinality_so_far.max(row_cardinality);
if cardinality_so_far >= cardinality_before_deletes {
// There won't be any improvement in the cardinality.
// We can early exit.
return cardinality_before_deletes;
}
}
cardinality_so_far
}
fn detect_cardinality(
column_indexes: &[ColumnIndex],
merge_row_order: &MergeRowOrder,
) -> Cardinality {
match merge_row_order {
MergeRowOrder::Stack(_) => column_indexes
.iter()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full),
MergeRowOrder::Shuffled(shuffle_merge_order) => {
let mut merged_cardinality = Cardinality::Full;
for (column_index, alive_bitset_opt) in column_indexes
.iter()
.zip(shuffle_merge_order.alive_bitsets.iter())
{
let cardinality: Cardinality =
detect_cardinality_single_column_index(column_index, alive_bitset_opt);
if cardinality == Cardinality::Multivalued {
return cardinality;
}
merged_cardinality = merged_cardinality.max(cardinality);
}
merged_cardinality
}
}
// For simplification, we never have cardinality go down due to deletes.
fn detect_cardinality(columns: &[Option<ColumnIndex>]) -> Cardinality {
columns
.iter()
.flatten()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full)
}
pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex],
columns: &'a [Option<ColumnIndex>],
merge_row_order: &'a MergeRowOrder,
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
let cardinality_after_merge = detect_cardinality(columns);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
@@ -98,61 +45,42 @@ mod tests {
use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
};
use crate::{Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder};
#[test]
fn test_detect_cardinality() {
assert_eq!(
detect_cardinality(&[], &StackMergeOrder::stack_for_test(&[]).into()),
Cardinality::Full
);
assert_eq!(detect_cardinality(&[]), Cardinality::Full);
let optional_index: ColumnIndex = OptionalIndex::for_test(1, &[]).into();
let multivalued_index: ColumnIndex = MultiValueIndex::for_test(&[0, 1]).into();
assert_eq!(
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
detect_cardinality(&[Some(optional_index.clone()), None]),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Full],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[Some(optional_index.clone()), Some(ColumnIndex::Full)]),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(
&[
multivalued_index.clone(),
ColumnIndex::Empty { num_docs: 0 }
],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
detect_cardinality(&[Some(multivalued_index.clone()), None]),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(
&[multivalued_index.clone(), optional_index.clone()],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[
Some(multivalued_index.clone()),
Some(optional_index.clone())
]),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(
&[optional_index, multivalued_index],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[Some(optional_index), Some(multivalued_index)]),
Cardinality::Multivalued
);
}
#[test]
fn test_merge_index_multivalued_sorted() {
let column_indexes: Vec<ColumnIndex> = vec![MultiValueIndex::for_test(&[0, 2, 5]).into()];
let column_indexes: Vec<Option<ColumnIndex>> =
vec![Some(MultiValueIndex::for_test(&[0, 2, 5]).into())];
let merge_row_order: MergeRowOrder = ShuffleMergeOrder::for_test(
&[2],
vec![
@@ -168,19 +96,18 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
#[test]
fn test_merge_index_multivalued_sorted_several_segment() {
let column_indexes: Vec<ColumnIndex> = vec![
MultiValueIndex::for_test(&[0, 2, 5]).into(),
ColumnIndex::Empty { num_docs: 0 },
MultiValueIndex::for_test(&[0, 1, 4]).into(),
let column_indexes: Vec<Option<ColumnIndex>> = vec![
Some(MultiValueIndex::for_test(&[0, 2, 5]).into()),
None,
Some(MultiValueIndex::for_test(&[0, 1, 4]).into()),
];
let merge_row_order: MergeRowOrder = ShuffleMergeOrder::for_test(
&[2, 0, 2],
@@ -201,9 +128,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

View File

@@ -5,7 +5,7 @@ use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
pub fn merge_column_index_shuffled<'a>(
column_indexes: &'a [ColumnIndex],
column_indexes: &'a [Option<ColumnIndex>],
cardinality_after_merge: Cardinality,
shuffle_merge_order: &'a ShuffleMergeOrder,
) -> SerializableColumnIndex<'a> {
@@ -33,41 +33,41 @@ pub fn merge_column_index_shuffled<'a>(
///
/// In other words the column_indexes passed as argument may NOT be multivalued.
fn merge_column_index_shuffled_optional<'a>(
column_indexes: &'a [ColumnIndex],
column_indexes: &'a [Option<ColumnIndex>],
merge_order: &'a ShuffleMergeOrder,
) -> Box<dyn Iterable<RowId> + 'a> {
Box::new(ShuffledIndex {
Box::new(ShuffledOptionalIndex {
column_indexes,
merge_order,
})
}
struct ShuffledIndex<'a> {
column_indexes: &'a [ColumnIndex],
struct ShuffledOptionalIndex<'a> {
column_indexes: &'a [Option<ColumnIndex>],
merge_order: &'a ShuffleMergeOrder,
}
impl<'a> Iterable<u32> for ShuffledIndex<'a> {
impl<'a> Iterable<u32> for ShuffledOptionalIndex<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(
self.merge_order
.iter_new_to_old_row_addrs()
.enumerate()
.filter_map(|(new_row_id, old_row_addr)| {
let column_index = &self.column_indexes[old_row_addr.segment_ord as usize];
let row_id = new_row_id as u32;
if column_index.has_value(old_row_addr.row_id) {
Some(row_id)
} else {
None
}
}),
)
Box::new(self.merge_order
.iter_new_to_old_row_addrs()
.enumerate()
.filter_map(|(new_row_id, old_row_addr)| {
let Some(column_index) = &self.column_indexes[old_row_addr.segment_ord as usize] else {
return None;
};
let row_id = new_row_id as u32;
if column_index.has_value(old_row_addr.row_id) {
Some(row_id)
} else {
None
}
}))
}
}
fn merge_column_index_shuffled_multivalued<'a>(
column_indexes: &'a [ColumnIndex],
column_indexes: &'a [Option<ColumnIndex>],
merge_order: &'a ShuffleMergeOrder,
) -> Box<dyn Iterable<RowId> + 'a> {
Box::new(ShuffledMultivaluedIndex {
@@ -77,18 +77,20 @@ fn merge_column_index_shuffled_multivalued<'a>(
}
struct ShuffledMultivaluedIndex<'a> {
column_indexes: &'a [ColumnIndex],
column_indexes: &'a [Option<ColumnIndex>],
merge_order: &'a ShuffleMergeOrder,
}
fn iter_num_values<'a>(
column_indexes: &'a [ColumnIndex],
column_indexes: &'a [Option<ColumnIndex>],
merge_order: &'a ShuffleMergeOrder,
) -> impl Iterator<Item = u32> + 'a {
merge_order.iter_new_to_old_row_addrs().map(|row_addr| {
let column_index = &column_indexes[row_addr.segment_ord as usize];
let Some(column_index) = &column_indexes[row_addr.segment_ord as usize] else {
// No values in the entire column. It surely means there are 0 values associated to this row.
return 0u32;
};
match column_index {
ColumnIndex::Empty { .. } => 0u32,
ColumnIndex::Full => 1,
ColumnIndex::Optional(optional_index) => {
u32::from(optional_index.contains(row_addr.row_id))
@@ -140,7 +142,7 @@ mod tests {
#[test]
fn test_merge_column_index_optional_shuffle() {
let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into();
let column_indexes = vec![optional_index, ColumnIndex::Full];
let column_indexes = vec![Some(optional_index), Some(ColumnIndex::Full)];
let row_addrs = vec![
RowAddr {
segment_ord: 0u32,
@@ -157,13 +159,7 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
non_null_row_ids,
num_rows,
} = serializable_index
else {
panic!()
};
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
assert_eq!(num_rows, 2);
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
assert_eq!(&non_null_rows, &[1]);

View File

@@ -9,7 +9,7 @@ use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
///
/// There are no sort nor deletes involved.
pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex],
columns: &'a [Option<ColumnIndex>],
cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder,
) -> SerializableColumnIndex<'a> {
@@ -33,7 +33,7 @@ pub fn merge_column_index_stacked<'a>(
}
struct StackedOptionalIndex<'a> {
columns: &'a [ColumnIndex],
columns: &'a [Option<ColumnIndex>],
stack_merge_order: &'a StackMergeOrder,
}
@@ -46,16 +46,16 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
.flat_map(|(columnar_id, column_index_opt)| {
let columnar_row_range = self.stack_merge_order.columnar_range(columnar_id);
let rows_it: Box<dyn Iterator<Item = RowId>> = match column_index_opt {
ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new(
Some(ColumnIndex::Full) => Box::new(columnar_row_range),
Some(ColumnIndex::Optional(optional_index)) => Box::new(
optional_index
.iter_rows()
.map(move |row_id: RowId| columnar_row_range.start + row_id),
),
ColumnIndex::Multivalued(_) => {
Some(ColumnIndex::Multivalued(_)) => {
panic!("No multivalued index is allowed when stacking column index");
}
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
None => Box::new(std::iter::empty()),
};
rows_it
}),
@@ -65,18 +65,18 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
#[derive(Clone, Copy)]
struct StackedMultivaluedIndex<'a> {
columns: &'a [ColumnIndex],
columns: &'a [Option<ColumnIndex>],
stack_merge_order: &'a StackMergeOrder,
}
fn convert_column_opt_to_multivalued_index<'a>(
column_index_opt: &'a ColumnIndex,
column_index_opt: Option<&'a ColumnIndex>,
num_rows: RowId,
) -> Box<dyn Iterator<Item = RowId> + 'a> {
match column_index_opt {
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
ColumnIndex::Full => Box::new(0..num_rows + 1),
ColumnIndex::Optional(optional_index) => {
None => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
Some(ColumnIndex::Full) => Box::new(0..num_rows + 1),
Some(ColumnIndex::Optional(optional_index)) => {
Box::new(
(0..num_rows)
// TODO optimize
@@ -84,7 +84,9 @@ fn convert_column_opt_to_multivalued_index<'a>(
.chain(std::iter::once(optional_index.num_non_nulls())),
)
}
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
Some(ColumnIndex::Multivalued(multivalued_index)) => {
multivalued_index.start_index_column.iter()
}
}
}
@@ -93,6 +95,7 @@ impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
let multivalued_indexes =
self.columns
.iter()
.map(Option::as_ref)
.enumerate()
.map(|(columnar_id, column_opt)| {
let num_rows =

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge;
mod multivalued_index;
mod optional_index;
@@ -17,11 +12,8 @@ pub use serialize::{open_column_index, serialize_column_index, SerializableColum
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::{Cardinality, DocId, RowId};
#[derive(Clone, Debug)]
#[derive(Clone)]
pub enum ColumnIndex {
Empty {
num_docs: u32,
},
Full,
Optional(OptionalIndex),
/// In addition, at index num_rows, an extra value is added
@@ -42,19 +34,9 @@ impl From<MultiValueIndex> for ColumnIndex {
}
impl ColumnIndex {
#[inline]
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
/// Returns the cardinality of the column index.
///
/// By convention, if the column contains no docs, we consider that it is
/// full.
#[inline]
pub fn get_cardinality(&self) -> Cardinality {
match self {
ColumnIndex::Empty { num_docs: 0 } | ColumnIndex::Full => Cardinality::Full,
ColumnIndex::Empty { .. } => Cardinality::Optional,
ColumnIndex::Full => Cardinality::Full,
ColumnIndex::Optional(_) => Cardinality::Optional,
ColumnIndex::Multivalued(_) => Cardinality::Multivalued,
}
@@ -63,7 +45,6 @@ impl ColumnIndex {
/// Returns true if and only if there are at least one value associated to the row.
pub fn has_value(&self, doc_id: DocId) -> bool {
match self {
ColumnIndex::Empty { .. } => false,
ColumnIndex::Full => true,
ColumnIndex::Optional(optional_index) => optional_index.contains(doc_id),
ColumnIndex::Multivalued(multivalued_index) => {
@@ -74,7 +55,6 @@ impl ColumnIndex {
pub fn value_row_ids(&self, doc_id: DocId) -> Range<RowId> {
match self {
ColumnIndex::Empty { .. } => 0..0,
ColumnIndex::Full => doc_id..doc_id + 1,
ColumnIndex::Optional(optional_index) => {
if let Some(val) = optional_index.rank_if_exists(doc_id) {
@@ -87,48 +67,8 @@ impl ColumnIndex {
}
}
/// Translates a block of docis to row_ids.
///
/// returns the row_ids and the matching docids on the same index
/// e.g.
/// DocId In: [0, 5, 6]
/// DocId Out: [0, 0, 6, 6]
/// RowId Out: [0, 1, 2, 3]
#[inline]
pub fn docids_to_rowids(
&self,
doc_ids: &[DocId],
doc_ids_out: &mut Vec<DocId>,
row_ids: &mut Vec<RowId>,
) {
match self {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
doc_ids_out.extend_from_slice(doc_ids);
row_ids.extend_from_slice(doc_ids);
}
ColumnIndex::Optional(optional_index) => {
for doc_id in doc_ids {
if let Some(row_id) = optional_index.rank_if_exists(*doc_id) {
doc_ids_out.push(*doc_id);
row_ids.push(row_id);
}
}
}
ColumnIndex::Multivalued(multivalued_index) => {
for doc_id in doc_ids {
for row_id in multivalued_index.range(*doc_id) {
doc_ids_out.push(*doc_id);
row_ids.push(row_id);
}
}
}
}
}
pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> {
match self {
ColumnIndex::Empty { .. } => 0..0,
ColumnIndex::Full => doc_id,
ColumnIndex::Optional(optional_index) => {
let row_start = optional_index.rank(doc_id.start);
@@ -147,11 +87,8 @@ impl ColumnIndex {
}
}
pub fn select_batch_in_place(&self, doc_id_start: DocId, rank_ids: &mut Vec<RowId>) {
pub fn select_batch_in_place(&self, rank_ids: &mut Vec<RowId>, doc_id_start: DocId) {
match self {
ColumnIndex::Empty { .. } => {
rank_ids.clear();
}
ColumnIndex::Full => {
// No need to do anything:
// value_idx and row_idx are the same.
@@ -165,21 +102,3 @@ impl ColumnIndex {
}
}
}
#[cfg(test)]
mod tests {
use crate::{Cardinality, ColumnIndex};
#[test]
fn test_column_index_get_cardinality() {
assert_eq!(
ColumnIndex::Empty { num_docs: 0 }.get_cardinality(),
Cardinality::Full
);
assert_eq!(ColumnIndex::Full.get_cardinality(), Cardinality::Full);
assert_eq!(
ColumnIndex::Empty { num_docs: 1 }.get_cardinality(),
Cardinality::Optional
);
}
}

View File

@@ -5,9 +5,8 @@ use std::sync::Arc;
use common::OwnedBytes;
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
};
use crate::column_values::u64_based::CodecType;
use crate::column_values::ColumnValues;
use crate::iterable::Iterable;
use crate::{DocId, RowId};
@@ -15,7 +14,7 @@ pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
output: &mut impl Write,
) -> io::Result<()> {
serialize_u64_based_column_values(
crate::column_values::u64_based::serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
@@ -24,7 +23,8 @@ pub fn serialize_multivalued_index(
}
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
let start_index_column: Arc<dyn ColumnValues<RowId>> =
crate::column_values::u64_based::load_u64_based_column_values(bytes)?;
Ok(MultiValueIndex { start_index_column })
}
@@ -35,14 +35,6 @@ pub struct MultiValueIndex {
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl std::fmt::Debug for MultiValueIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("MultiValuedIndex")
.field("num_rows", &self.start_index_column.num_vals())
.finish_non_exhaustive()
}
}
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
MultiValueIndex { start_index_column }
@@ -91,13 +83,13 @@ impl MultiValueIndex {
let mut cur_doc = docid_start;
let mut last_doc = None;
assert!(self.start_index_column.get_val(docid_start) <= ranks[0]);
assert!(self.start_index_column.get_val(docid_start) as u32 <= ranks[0]);
let mut write_doc_pos = 0;
for i in 0..ranks.len() {
let pos = ranks[i];
loop {
let end = self.start_index_column.get_val(cur_doc + 1);
let end = self.start_index_column.get_val(cur_doc + 1) as u32;
if end > pos {
ranks[write_doc_pos] = cur_doc;
write_doc_pos += if last_doc == Some(cur_doc) { 0 } else { 1 };
@@ -114,8 +106,11 @@ impl MultiValueIndex {
#[cfg(test)]
mod tests {
use std::ops::Range;
use std::sync::Arc;
use super::MultiValueIndex;
use crate::column_values::IterColumn;
use crate::{ColumnValues, RowId};
fn index_to_pos_helper(
index: &MultiValueIndex,
@@ -129,7 +124,9 @@ mod tests {
#[test]
fn test_positions_to_docid() {
let index = MultiValueIndex::for_test(&[0, 10, 12, 15, 22, 23]);
let offsets: Vec<RowId> = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
let column: Arc<dyn ColumnValues<RowId>> = Arc::new(IterColumn::from(offsets.into_iter()));
let index = MultiValueIndex::from(column);
assert_eq!(index.num_docs(), 5);
let positions = &[10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);

View File

@@ -88,15 +88,6 @@ pub struct OptionalIndex {
block_metas: Arc<[BlockMeta]>,
}
impl std::fmt::Debug for OptionalIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OptionalIndex")
.field("num_rows", &self.num_rows)
.field("num_non_null_rows", &self.num_non_null_rows)
.finish_non_exhaustive()
}
}
/// Splits a value address into lower and upper 16bits.
/// The lower 16 bits are the value in the block
/// The upper 16 bits are the block index
@@ -449,7 +440,7 @@ impl SerializedBlockMeta {
#[inline]
fn is_sparse(num_rows_in_block: u32) -> bool {
num_rows_in_block < DENSE_BLOCK_THRESHOLD
num_rows_in_block < DENSE_BLOCK_THRESHOLD as u32
}
fn deserialize_optional_index_block_metadatas(
@@ -457,7 +448,7 @@ fn deserialize_optional_index_block_metadatas(
num_rows: u32,
) -> (Box<[BlockMeta]>, u32) {
let num_blocks = data.len() / SERIALIZED_BLOCK_META_NUM_BYTES;
let mut block_metas = Vec::with_capacity(num_blocks + 1);
let mut block_metas = Vec::with_capacity(num_blocks as usize + 1);
let mut start_byte_offset = 0;
let mut non_null_rows_before_block = 0;
for block_meta_bytes in data.chunks_exact(SERIALIZED_BLOCK_META_NUM_BYTES) {
@@ -488,7 +479,7 @@ fn deserialize_optional_index_block_metadatas(
block_variant,
});
start_byte_offset += block_variant.num_bytes_in_block();
non_null_rows_before_block += num_non_null_rows;
non_null_rows_before_block += num_non_null_rows as u32;
}
block_metas.resize(
((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize,

View File

@@ -32,7 +32,7 @@ pub const MINI_BLOCK_NUM_BYTES: usize = MINI_BLOCK_BITVEC_NUM_BYTES + MINI_BLOCK
/// Number of bytes in a dense block.
pub const DENSE_BLOCK_NUM_BYTES: u32 =
(ELEMENTS_PER_BLOCK / ELEMENTS_PER_MINI_BLOCK as u32) * MINI_BLOCK_NUM_BYTES as u32;
(ELEMENTS_PER_BLOCK as u32 / ELEMENTS_PER_MINI_BLOCK as u32) * MINI_BLOCK_NUM_BYTES as u32;
pub struct DenseBlockCodec;
@@ -229,7 +229,7 @@ pub fn serialize_dense_codec(
while block_id > current_block_id {
let dense_mini_block = DenseMiniBlock {
bitvec: block,
rank: non_null_rows_before,
rank: non_null_rows_before as u16,
};
output.write_all(&dense_mini_block.to_bytes())?;
non_null_rows_before += block.count_ones() as u16;

View File

@@ -37,7 +37,7 @@ proptest! {
fn test_with_random_sets_simple() {
let vals = 10..BLOCK_SIZE * 2;
let mut out: Vec<u8> = Vec::new();
serialize_optional_index(&vals, 100, &mut out).unwrap();
serialize_optional_index(&vals.clone(), 100, &mut out).unwrap();
let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
let ranks: Vec<u32> = (65_472u32..65_473u32).collect();
let els: Vec<u32> = ranks.iter().copied().map(|rank| rank + 10).collect();
@@ -215,12 +215,12 @@ mod bench {
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.filter(|(pos, val)| *val)
.map(|(pos, _)| pos as RowId)
.collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap()
let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
codec
}
fn random_range_iterator(
@@ -242,7 +242,7 @@ mod bench {
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
}
}
/// Serialize a column index.
pub fn serialize_column_index(
column_index: SerializableColumnIndex,
output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes)
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(

View File

@@ -1,135 +0,0 @@
use std::sync::Arc;
use common::OwnedBytes;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::{self, Bencher};
use super::*;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55000_u64)
.map(|num| num + rng.gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
}
stats_collector.stats()
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();
for val in data {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
#[inline(never)]
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = Arc::new(get_reader_for_bench::<Codec>(data));
bench_get_dynamic_helper(b, col);
}
fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let stats = compute_stats(data.iter().cloned());
let mut bytes = Vec::new();
b.iter(|| {
bytes.clear();
let mut codec_serializer = Codec::estimator();
for val in data.iter().take(1024) {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
});
}
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
}

View File

@@ -0,0 +1,367 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
use std::sync::Arc;
use tantivy_bitpacker::minmax;
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
use crate::RowId;
/// `ColumnValues` provides access to a dense field column.
///
/// `Column` are just a wrapper over `ColumnValues` and a `ColumnIndex`.
pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// Must panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
#[inline(always)]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
}
}
/// Get the row ids of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
#[inline(always)]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range.start..row_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
row_id_hits.push(idx);
}
}
}
/// Returns the minimum value for this fast field.
///
/// This min_value may not be exact.
/// For instance, the min value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.min_value()`.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// This max_value may not be exact.
/// For instance, the max value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.max_value()`.
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
#[inline(always)]
fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx)
}
#[inline(always)]
fn min_value(&self) -> T {
self.as_ref().min_value()
}
#[inline(always)]
fn max_value(&self) -> T {
self.as_ref().max_value()
}
#[inline(always)]
fn num_vals(&self) -> u32 {
self.as_ref().num_vals()
}
#[inline(always)]
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
self.as_ref().iter()
}
#[inline(always)]
fn get_range(&self, start: u64, output: &mut [T]) {
self.as_ref().get_range(start, output)
}
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
self.as_ref().get_row_ids_for_value_range(value_range, row_id_range, row_id_hits)
}
}
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
pub(crate) values: &'a [T],
pub(crate) min_value: T,
pub(crate) max_value: T,
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.values.iter().copied())
}
fn min_value(&self) -> T {
self.min_value
}
fn max_value(&self) -> T {
self.max_value
}
fn num_vals(&self) -> u32 {
self.values.len() as u32
}
fn get_range(&self, start: u64, output: &mut [T]) {
output.copy_from_slice(&self.values[start as usize..][..output.len()])
}
}
impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
where V: AsRef<[T]> + ?Sized
{
fn from(values: &'a V) -> Self {
let values = values.as_ref();
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
Self {
values,
min_value,
max_value,
}
}
}
struct MonotonicMappingColumn<C, T, Input> {
from_column: C,
monotonic_mapping: T,
_phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a strictly monotonic mapping. See
/// [`StrictlyMonotonicFn`].
///
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
///
/// The inverse of the mapping is required for:
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column.
///
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
/// monotonic_mapping during serialization.
pub fn monotonic_map_column<C, T, Input, Output>(
from_column: C,
monotonic_mapping: T,
) -> impl ColumnValues<Output>
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Debug + Send + Sync + Clone,
Output: PartialOrd + Debug + Send + Sync + Clone,
{
MonotonicMappingColumn {
from_column,
monotonic_mapping,
_phantom: PhantomData,
}
}
impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Debug + Sync + Clone,
Output: PartialOrd + Send + Debug + Sync + Clone,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val)
}
fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
self.monotonic_mapping.mapping(from_min_value)
}
fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
self.monotonic_mapping.mapping(from_max_value)
}
fn num_vals(&self) -> u32 {
self.from_column.num_vals()
}
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
Box::new(
self.from_column
.iter()
.map(|el| self.monotonic_mapping.mapping(el)),
)
}
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_row_ids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
}
// We voluntarily do not implement get_range as it yields a regression,
// and we do not have any specialized implementation anyway.
}
/// Wraps an iterator into a `Column`.
pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T>
where T: Iterator + Clone + ExactSizeIterator
{
fn from(iter: T) -> Self {
IterColumn(iter)
}
}
impl<T> ColumnValues<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd + Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()
}
fn min_value(&self) -> T::Item {
self.0.clone().next().unwrap()
}
fn max_value(&self) -> T::Item {
self.0.clone().last().unwrap()
}
fn num_vals(&self) -> u32 {
self.0.len() as u32
}
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
Box::new(self.0.clone())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::column_values::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
StrictlyMonotonicMappingToInternalGCDBaseval,
};
#[test]
fn test_monotonic_mapping() {
let vals = &[3u64, 5u64][..];
let col = VecColumn::from(vals);
let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
assert_eq!(mapped.min_value(), 1u64);
assert_eq!(mapped.max_value(), 3u64);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.get_val(0), 1);
assert_eq!(mapped.get_val(1), 3);
}
#[test]
fn test_range_as_col() {
let col = IterColumn::from(10..100);
assert_eq!(col.num_vals(), 90);
assert_eq!(col.max_value(), 99);
}
#[test]
fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
),
);
let val_i64s: Vec<u64> = mapped.iter().collect();
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
}
}
#[test]
fn test_monotonic_mapping_get_range() {
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
),
);
assert_eq!(mapped.min_value(), 0u64);
assert_eq!(mapped.max_value(), 9900u64);
assert_eq!(mapped.num_vals(), 100);
let val_u64s: Vec<u64> = mapped.iter().collect();
assert_eq!(val_u64s.len(), 100);
for i in 0..100 {
assert_eq!(val_u64s[i as usize], mapped.get_val(i));
assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
}
let mut buf = [0u64; 20];
mapped.get_range(7, &mut buf[..]);
assert_eq!(&val_u64s[7..][..20], &buf);
}
}

View File

@@ -38,6 +38,6 @@ impl Ord for BlankRange {
}
impl PartialOrd for BlankRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
Some(self.blank_size().cmp(&other.blank_size()))
}
}

View File

@@ -10,7 +10,7 @@ use super::{CompactSpace, RangeMapping};
/// Put the blanks for the sorted values into a binary heap
fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
for (first, second) in values_sorted.iter().copied().tuple_windows() {
for (first, second) in values_sorted.iter().tuple_windows() {
// Correctness Overflow: the values are deduped and sorted (BTreeSet property), that means
// there's always space between two values.
let blank_range = first + 1..=second - 1;
@@ -65,12 +65,12 @@ pub fn get_compact_space(
return compact_space_builder.finish();
}
// We start by space that's limited to min_value..=max_value
// Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
let min_value = values_deduped_sorted.iter().next().copied().unwrap_or(0);
let max_value = values_deduped_sorted.iter().last().copied().unwrap_or(0);
let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
// Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
// We start by space that's limited to min_value..=max_value
let min_value = *values_deduped_sorted.iter().next().unwrap_or(&0);
let max_value = *values_deduped_sorted.iter().last().unwrap_or(&0);
// +1 for null, in case min and max covers the whole space, we are off by one.
let mut amplitude_compact_space = (max_value - min_value).saturating_add(1);
@@ -84,7 +84,6 @@ pub fn get_compact_space(
let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
let mut blank_collector = BlankCollector::new();
// We will stage blanks until they reduce the compact space by at least 1 bit and then flush
// them if the metadata cost is lower than the total number of saved bits.
// Binary heap to process the gaps by their size
@@ -94,7 +93,6 @@ pub fn get_compact_space(
let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum;
let amplitude_new_bits = num_bits(amplitude_new_compact_space);
if amplitude_bits == amplitude_new_bits {
continue;
}
@@ -102,16 +100,7 @@ pub fn get_compact_space(
// TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
// when amplitude_new_bits changes
let cost = blank_collector.num_staged_blanks() * cost_per_blank;
// We want to end up with a compact space that fits into 32 bits.
// In order to deal with pathological cases, we force the algorithm to keep
// refining the compact space the amplitude bits is lower than 32.
//
// The worst case scenario happens for a large number of u128s regularly
// spread over the full u128 space.
//
// This change will force the algorithm to degenerate into dictionary encoding.
if amplitude_bits <= 32 && cost >= saved_bits {
if cost >= saved_bits {
// Continue here, since although we walk over the blanks by size,
// we can potentially save a lot at the last bits, which are smaller blanks
//
@@ -126,8 +115,6 @@ pub fn get_compact_space(
compact_space_builder.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
}
assert!(amplitude_bits <= 32);
// special case, when we don't collected any blanks because:
// * the data is empty (early exit)
// * the algorithm did decide it's not worth the cost, which can be the case for single values
@@ -212,7 +199,7 @@ impl CompactSpaceBuilder {
covered_space.push(0..=0); // empty data case
};
let mut compact_start: u32 = 1; // 0 is reserved for `null`
let mut compact_start: u64 = 1; // 0 is reserved for `null`
let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
for cov in covered_space {
let range_mapping = super::RangeMapping {
@@ -231,7 +218,6 @@ impl CompactSpaceBuilder {
#[cfg(test)]
mod tests {
use super::*;
use crate::column_values::u128_based::compact_space::COST_PER_BLANK_IN_BITS;
#[test]
fn test_binary_heap_pop_order() {
@@ -242,11 +228,4 @@ mod tests {
assert_eq!(blanks.pop().unwrap().blank_size(), 101);
assert_eq!(blanks.pop().unwrap().blank_size(), 11);
}
#[test]
fn test_worst_case_scenario() {
let vals: BTreeSet<u128> = (0..8).map(|i| i * ((1u128 << 34) / 8)).collect();
let compact_space = get_compact_space(&vals, vals.len() as u32, COST_PER_BLANK_IN_BITS);
assert!(compact_space.amplitude_compact_space() < u32::MAX as u128);
}
}

View File

@@ -17,16 +17,16 @@ use std::{
ops::{Range, RangeInclusive},
};
mod blank_range;
mod build_compact_space;
use build_compact_space::get_compact_space;
use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
use crate::column_values::compact_space::build_compact_space::get_compact_space;
use crate::column_values::ColumnValues;
use crate::RowId;
mod blank_range;
mod build_compact_space;
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
/// blanks depends on the number of blanks.
///
@@ -42,15 +42,15 @@ pub struct CompactSpace {
#[derive(Debug, Clone, Eq, PartialEq)]
struct RangeMapping {
value_range: RangeInclusive<u128>,
compact_start: u32,
compact_start: u64,
}
impl RangeMapping {
fn range_length(&self) -> u32 {
(self.value_range.end() - self.value_range.start()) as u32 + 1
fn range_length(&self) -> u64 {
(self.value_range.end() - self.value_range.start()) as u64 + 1
}
// The last value of the compact space in this range
fn compact_end(&self) -> u32 {
fn compact_end(&self) -> u64 {
self.compact_start + self.range_length() - 1
}
}
@@ -81,7 +81,7 @@ impl BinarySerializable for CompactSpace {
let num_ranges = VInt::deserialize(reader)?.0;
let mut ranges_mapping: Vec<RangeMapping> = vec![];
let mut value = 0u128;
let mut compact_start = 1u32; // 0 is reserved for `null`
let mut compact_start = 1u64; // 0 is reserved for `null`
for _ in 0..num_ranges {
let blank_delta_start = VIntU128::deserialize(reader)?.0;
value += blank_delta_start;
@@ -122,10 +122,10 @@ impl CompactSpace {
/// Returns either Ok(the value in the compact space) or if it is outside the compact space the
/// Err(position where it would be inserted)
fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
self.ranges_mapping
.binary_search_by(|probe| {
let value_range: &RangeInclusive<u128> = &probe.value_range;
let value_range = &probe.value_range;
if value < *value_range.start() {
Ordering::Greater
} else if value > *value_range.end() {
@@ -136,13 +136,13 @@ impl CompactSpace {
})
.map(|pos| {
let range_mapping = &self.ranges_mapping[pos];
let pos_in_range: u32 = (value - range_mapping.value_range.start()) as u32;
let pos_in_range = (value - range_mapping.value_range.start()) as u64;
range_mapping.compact_start + pos_in_range
})
}
/// Unpacks a value from compact space u32 to u128 space
fn compact_to_u128(&self, compact: u32) -> u128 {
/// Unpacks a value from compact space u64 to u128 space
fn compact_to_u128(&self, compact: u64) -> u128 {
let pos = self
.ranges_mapping
.binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
@@ -178,15 +178,11 @@ impl CompactSpaceCompressor {
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(iter: impl Iterator<Item = u128>) -> Self {
let mut values_sorted = BTreeSet::new();
// Total number of values, with their redundancy.
let mut total_num_values = 0u32;
for val in iter {
total_num_values += 1u32;
values_sorted.insert(val);
}
let min_value = *values_sorted.iter().next().unwrap_or(&0);
let max_value = *values_sorted.iter().last().unwrap_or(&0);
let compact_space =
get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
let amplitude_compact_space = compact_space.amplitude_compact_space();
@@ -197,12 +193,13 @@ impl CompactSpaceCompressor {
);
let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
let min_value = *values_sorted.iter().next().unwrap_or(&0);
let max_value = *values_sorted.iter().last().unwrap_or(&0);
assert_eq!(
compact_space
.u128_to_compact(max_value)
.expect("could not convert max value to compact space"),
amplitude_compact_space as u32
amplitude_compact_space as u64
);
CompactSpaceCompressor {
params: IPCodecParams {
@@ -243,7 +240,7 @@ impl CompactSpaceCompressor {
"Could not convert value to compact_space. This is a bug.",
)
})?;
bitpacker.write(compact as u64, self.params.num_bits, write)?;
bitpacker.write(compact, self.params.num_bits, write)?;
}
bitpacker.close(write)?;
self.write_footer(write)?;
@@ -317,6 +314,48 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
positions_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.get_positions_for_value_range(value_range, positions_range, positions)
}
}
impl CompactSpaceDecompressor {
pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
let decompressor = CompactSpaceDecompressor { data, params };
Ok(decompressor)
}
/// Converting to compact space for the decompressor is more complex, since we may get values
/// which are outside the compact space. e.g. if we map
/// 1000 => 5
/// 2000 => 6
///
/// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
/// error with the index of the next range.
fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
self.params.compact_space.u128_to_compact(value)
}
fn compact_to_u128(&self, compact: u64) -> u128 {
self.params.compact_space.compact_to_u128(compact)
}
/// Comparing on compact space: Random dataset 0,24 (50% random hit) - 1.05 GElements/s
/// Comparing on compact space: Real dataset 1.08 GElements/s
///
/// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
#[inline]
pub fn get_positions_for_value_range(
&self,
value_range: RangeInclusive<u128>,
position_range: Range<u32>,
@@ -356,42 +395,44 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
range_mapping.compact_end()
});
let value_range = compact_from..=compact_to;
self.get_positions_for_compact_value_range(value_range, position_range, positions);
}
}
let range = compact_from..=compact_to;
impl CompactSpaceDecompressor {
pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
let scan_num_docs = position_range.end - position_range.start;
let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
let decompressor = CompactSpaceDecompressor { data, params };
let step_size = 4;
let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size;
Ok(decompressor)
}
let mut push_if_in_range = |idx, val| {
if range.contains(&val) {
positions.push(idx);
}
};
let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data);
// unrolled loop
for idx in (position_range.start..cutoff).step_by(step_size as usize) {
let idx1 = idx;
let idx2 = idx + 1;
let idx3 = idx + 2;
let idx4 = idx + 3;
let val1 = get_val(idx1);
let val2 = get_val(idx2);
let val3 = get_val(idx3);
let val4 = get_val(idx4);
push_if_in_range(idx1, val1);
push_if_in_range(idx2, val2);
push_if_in_range(idx3, val3);
push_if_in_range(idx4, val4);
}
/// Converting to compact space for the decompressor is more complex, since we may get values
/// which are outside the compact space. e.g. if we map
/// 1000 => 5
/// 2000 => 6
///
/// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
/// error with the index of the next range.
fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
self.params.compact_space.u128_to_compact(value)
}
fn compact_to_u128(&self, compact: u32) -> u128 {
self.params.compact_space.compact_to_u128(compact)
// handle rest
for idx in cutoff..position_range.end {
push_if_in_range(idx, get_val(idx));
}
}
#[inline]
fn iter_compact(&self) -> impl Iterator<Item = u32> + '_ {
(0..self.params.num_vals)
.map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u32)
fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
(0..self.params.num_vals).map(move |idx| self.params.bit_unpacker.get(idx, &self.data))
}
#[inline]
@@ -404,7 +445,7 @@ impl CompactSpaceDecompressor {
#[inline]
pub fn get(&self, idx: u32) -> u128 {
let compact = self.params.bit_unpacker.get(idx, &self.data) as u32;
let compact = self.params.bit_unpacker.get(idx, &self.data);
self.compact_to_u128(compact)
}
@@ -415,20 +456,6 @@ impl CompactSpaceDecompressor {
pub fn max_value(&self) -> u128 {
self.params.max_value
}
fn get_positions_for_compact_value_range(
&self,
value_range: RangeInclusive<u32>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.params.bit_unpacker.get_ids_for_value_range(
*value_range.start() as u64..=*value_range.end() as u64,
position_range,
&self.data,
positions,
);
}
}
#[cfg(test)]
@@ -437,17 +464,17 @@ mod tests {
use itertools::Itertools;
use super::*;
use crate::column_values::u128_based::U128Header;
use crate::column_values::serialize::U128Header;
use crate::column_values::{open_u128_mapped, serialize_column_values_u128};
#[test]
fn compact_space_test() {
let ips: BTreeSet<u128> = [
let ips = &[
2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
]
.into_iter()
.collect();
let compact_space = get_compact_space(&ips, ips.len() as u32, 11);
let compact_space = get_compact_space(ips, ips.len() as u32, 11);
let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 17);
assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -470,8 +497,8 @@ mod tests {
);
for ip in ips {
let compact = compact_space.u128_to_compact(ip).unwrap();
assert_eq!(compact_space.compact_to_u128(compact), ip);
let compact = compact_space.u128_to_compact(*ip).unwrap();
assert_eq!(compact_space.compact_to_u128(compact), *ip);
}
}
@@ -497,7 +524,7 @@ mod tests {
.map(|pos| pos as u32)
.collect::<Vec<_>>();
let mut positions = Vec::new();
decompressor.get_row_ids_for_value_range(
decompressor.get_positions_for_value_range(
range,
0..decompressor.num_vals(),
&mut positions,
@@ -542,7 +569,7 @@ mod tests {
let val = *val;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
assert_eq!(positions, vec![pos]);
}

View File

@@ -1,40 +0,0 @@
use std::fmt::Debug;
use std::sync::Arc;
use crate::iterable::Iterable;
use crate::{ColumnIndex, ColumnValues, MergeRowOrder};
pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) column_indexes: &'a [ColumnIndex],
pub(crate) column_values: &'a [Option<Arc<dyn ColumnValues<T>>>],
pub(crate) merge_row_order: &'a MergeRowOrder,
}
impl<'a, T: Copy + PartialOrd + Debug> Iterable<T> for MergedColumnValues<'a, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order {
MergeRowOrder::Stack(_) => Box::new(
self.column_values
.iter()
.flatten()
.flat_map(|column_value| column_value.iter()),
),
MergeRowOrder::Shuffled(shuffle_merge_order) => Box::new(
shuffle_merge_order
.iter_new_to_old_row_addrs()
.flat_map(|row_addr| {
let column_index = &self.column_indexes[row_addr.segment_ord as usize];
let column_values =
self.column_values[row_addr.segment_ord as usize].as_ref()?;
let value_range = column_index.value_row_ids(row_addr.row_id);
Some((value_range, column_values))
})
.flat_map(|(value_range, column_values)| {
value_range
.into_iter()
.map(|val| column_values.get_val(val))
}),
),
}
}
}

View File

@@ -2,207 +2,265 @@
//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Columnar storage of data for tantivy [`Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128
use std::fmt::Debug;
use std::ops::{Range, RangeInclusive};
use std::io;
use std::io::Write;
use std::sync::Arc;
use common::{BinarySerializable, OwnedBytes};
use compact_space::CompactSpaceDecompressor;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
use monotonic_mapping::{StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
use serialize::U128Header;
mod merge;
mod compact_space;
pub(crate) mod monotonic_mapping;
pub(crate) mod monotonic_mapping_u128;
mod stats;
mod u128_based;
mod u64_based;
mod vec_column;
pub(crate) mod u64_based;
mod monotonic_column;
mod column;
pub(crate) mod serialize;
pub(crate) use merge::MergedColumnValues;
pub use serialize::serialize_column_values_u128;
pub use stats::ColumnStats;
pub use u128_based::{open_u128_mapped, serialize_column_values_u128};
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
};
pub use vec_column::VecColumn;
pub use self::monotonic_column::monotonic_map_column;
use crate::RowId;
pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
use crate::iterable::Iterable;
use crate::{ColumnIndex, MergeRowOrder};
/// `ColumnValues` provides access to a dense field column.
///
/// `Column` are just a wrapper over `ColumnValues` and a `ColumnIndex`.
///
/// Any methods with a default and specialized implementation need to be called in the
/// wrappers that implement the trait: Arc and MonotonicMappingColumn
pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T;
pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) column_indexes: &'a [Option<ColumnIndex>],
pub(crate) column_values: &'a [Option<Arc<dyn ColumnValues<T>>>],
pub(crate) merge_row_order: &'a MergeRowOrder,
}
/// Allows to push down multiple fetch calls, to avoid dynamic dispatch overhead.
///
/// idx and output should have the same length
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_vals(&self, indexes: &[u32], output: &mut [T]) {
assert!(indexes.len() == output.len());
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
out_x4[0] = self.get_val(idx_x4[0]);
out_x4[1] = self.get_val(idx_x4[1]);
out_x4[2] = self.get_val(idx_x4[2]);
out_x4[3] = self.get_val(idx_x4[3]);
impl<'a, T: Copy + PartialOrd + Debug> Iterable<T> for MergedColumnValues<'a, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order {
MergeRowOrder::Stack(_) => {
Box::new(self
.column_values
.iter()
.flatten()
.flat_map(|column_value| column_value.iter()))
},
MergeRowOrder::Shuffled(shuffle_merge_order) => {
Box::new(shuffle_merge_order
.iter_new_to_old_row_addrs()
.flat_map(|row_addr| {
let Some(column_index) = self.column_indexes[row_addr.segment_ord as usize].as_ref() else {
return None;
};
let Some(column_values) = self.column_values[row_addr.segment_ord as usize].as_ref() else {
return None;
};
let value_range = column_index.value_row_ids(row_addr.row_id);
Some((value_range, column_values))
})
.flat_map(|(value_range, column_values)| {
value_range
.into_iter()
.map(|val| column_values.get_val(val))
})
)
},
}
let step_size = 4;
let cutoff = indexes.len() - indexes.len() % step_size;
for idx in cutoff..indexes.len() {
output[idx] = self.get_val(indexes[idx]);
}
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// Must panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
#[inline(always)]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
}
}
/// Get the row ids of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range.start..row_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
row_id_hits.push(idx);
}
}
}
/// Returns a lower bound for this column of values.
///
/// All values are guaranteed to be higher than `.min_value()`
/// but this value is not necessary the best boundary value.
///
/// We have
/// ∀i < self.num_vals(), self.get_val(i) >= self.min_value()
/// But we don't have necessarily
/// ∃i < self.num_vals(), self.get_val(i) == self.min_value()
fn min_value(&self) -> T;
/// Returns an upper bound for this column of values.
///
/// All values are guaranteed to be lower than `.max_value()`
/// but this value is not necessary the best boundary value.
///
/// We have
/// ∀i < self.num_vals(), self.get_val(i) <= self.max_value()
/// But we don't have necessarily
/// ∃i < self.num_vals(), self.get_val(i) == self.max_value()
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
/// Empty column of values.
pub struct EmptyColumnValues;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
pub enum U128FastFieldCodecType {
/// This codec takes a large number space (u128) and reduces it to a compact number space, by
/// removing the holes.
CompactSpace = 1,
}
impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
fn get_val(&self, _idx: u32) -> T {
panic!("Internal Error: Called get_val of empty column.")
impl BinarySerializable for U128FastFieldCodecType {
fn serialize<W: Write + ?Sized>(&self, wrt: &mut W) -> io::Result<()> {
self.to_code().serialize(wrt)
}
fn min_value(&self) -> T {
T::default()
}
fn max_value(&self) -> T {
T::default()
}
fn num_vals(&self) -> u32 {
0
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let code = u8::deserialize(reader)?;
let codec_type: Self = Self::from_code(code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
Ok(codec_type)
}
}
impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
#[inline(always)]
fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx)
impl U128FastFieldCodecType {
pub(crate) fn to_code(self) -> u8 {
self as u8
}
#[inline(always)]
fn min_value(&self) -> T {
self.as_ref().min_value()
pub(crate) fn from_code(code: u8) -> Option<Self> {
match code {
1 => Some(Self::CompactSpace),
_ => None,
}
}
}
#[inline(always)]
fn max_value(&self) -> T {
self.as_ref().max_value()
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
#[inline(always)]
fn num_vals(&self) -> u32 {
self.as_ref().num_vals()
}
#[inline(always)]
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
self.as_ref().iter()
}
#[inline(always)]
fn get_range(&self, start: u64, output: &mut [T]) {
self.as_ref().get_range(start, output)
}
#[inline(always)]
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.as_ref()
.get_row_ids_for_value_range(range, doc_id_range, positions)
}
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
StrictlyMonotonicMappingToInternal::<T>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
}
#[cfg(all(test, feature = "unstable"))]
mod bench;
mod bench {
use std::sync::Arc;
use common::OwnedBytes;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::{self, Bencher};
use super::*;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55000_u64)
.map(|num| num + rng.gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
}
stats_collector.stats()
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();
for val in data {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
#[inline(never)]
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = Arc::new(get_reader_for_bench::<Codec>(data));
bench_get_dynamic_helper(b, col);
}
fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let stats = compute_stats(data.iter().cloned());
let mut bytes = Vec::new();
b.iter(|| {
bytes.clear();
let mut codec_serializer = Codec::estimator();
for val in data.iter().take(1024) {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
});
}
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
}
}

View File

@@ -1,120 +0,0 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
use crate::ColumnValues;
struct MonotonicMappingColumn<C, T, Input> {
from_column: C,
monotonic_mapping: T,
_phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a strictly monotonic mapping. See
/// [`StrictlyMonotonicFn`].
///
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
///
/// The inverse of the mapping is required for:
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column.
///
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
/// monotonic_mapping during serialization.
pub fn monotonic_map_column<C, T, Input, Output>(
from_column: C,
monotonic_mapping: T,
) -> impl ColumnValues<Output>
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Debug + Send + Sync + Clone,
Output: PartialOrd + Debug + Send + Sync + Clone,
{
MonotonicMappingColumn {
from_column,
monotonic_mapping,
_phantom: PhantomData,
}
}
impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Debug + Sync + Clone,
Output: PartialOrd + Send + Debug + Sync + Clone,
{
#[inline(always)]
fn get_val(&self, idx: u32) -> Output {
let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val)
}
fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
self.monotonic_mapping.mapping(from_min_value)
}
fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
self.monotonic_mapping.mapping(from_max_value)
}
fn num_vals(&self) -> u32 {
self.from_column.num_vals()
}
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
Box::new(
self.from_column
.iter()
.map(|el| self.monotonic_mapping.mapping(el)),
)
}
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_row_ids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
}
// We voluntarily do not implement get_range as it yields a regression,
// and we do not have any specialized implementation anyway.
}
#[cfg(test)]
mod tests {
use super::*;
use crate::column_values::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
};
use crate::column_values::VecColumn;
#[test]
fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()),
);
let val_i64s: Vec<u64> = mapped.iter().collect();
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
}
}
}

View File

@@ -2,6 +2,7 @@ use std::fmt::Debug;
use std::marker::PhantomData;
use common::DateTime;
use fastdivide::DividerU64;
use super::MonotonicallyMappableToU128;
use crate::RowId;
@@ -112,6 +113,68 @@ where T: MonotonicallyMappableToU64
}
}
/// Mapping dividing by gcd and a base value.
///
/// The function is assumed to be only called on values divided by passed
/// gcd value. (It is necessary for the function to be monotonic.)
pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval {
gcd_divider: DividerU64,
gcd: u64,
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalGCDBaseval {
/// Creates a linear mapping `x -> gcd*x + min_value`.
pub(crate) fn new(gcd: u64, min_value: u64) -> Self {
let gcd_divider = DividerU64::divide_by(gcd);
Self {
gcd_divider,
gcd,
min_value,
}
}
}
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalGCDBaseval
{
#[inline(always)]
fn mapping(&self, inp: External) -> u64 {
self.gcd_divider
.divide(External::to_u64(inp) - self.min_value)
}
#[inline(always)]
fn inverse(&self, out: u64) -> External {
External::from_u64(self.min_value + out * self.gcd)
}
}
/// Strictly monotonic mapping with a base value.
pub(crate) struct StrictlyMonotonicMappingToInternalBaseval {
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalBaseval {
/// Creates a linear mapping `x -> x + min_value`.
#[inline(always)]
pub(crate) fn new(min_value: u64) -> Self {
Self { min_value }
}
}
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalBaseval
{
#[inline(always)]
fn mapping(&self, val: External) -> u64 {
External::to_u64(val) - self.min_value
}
#[inline(always)]
fn inverse(&self, val: u64) -> External {
External::from_u64(self.min_value + val)
}
}
impl MonotonicallyMappableToU64 for u64 {
#[inline(always)]
fn to_u64(self) -> u64 {
@@ -139,12 +202,12 @@ impl MonotonicallyMappableToU64 for i64 {
impl MonotonicallyMappableToU64 for DateTime {
#[inline(always)]
fn to_u64(self) -> u64 {
common::i64_to_u64(self.into_timestamp_nanos())
common::i64_to_u64(self.into_timestamp_micros())
}
#[inline(always)]
fn from_u64(val: u64) -> Self {
DateTime::from_timestamp_nanos(common::u64_to_i64(val))
DateTime::from_timestamp_micros(common::u64_to_i64(val))
}
}
@@ -200,6 +263,13 @@ mod tests {
// TODO
// identity mapping
// test_round_trip(&StrictlyMonotonicMappingToInternal::<u128>::new(), 100u128);
// base value to i64 round trip
let mapping = StrictlyMonotonicMappingToInternalBaseval::new(100);
test_round_trip::<_, _, u64>(&mapping, 100i64);
// base value and gcd to u64 round trip
let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100);
test_round_trip::<_, _, u64>(&mapping, 100u64);
}
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(

View File

@@ -1,19 +1,12 @@
use std::fmt::Debug;
use std::io;
use std::io::Write;
use std::sync::Arc;
mod compact_space;
use common::{BinarySerializable, VInt};
use common::{BinarySerializable, OwnedBytes, VInt};
use compact_space::{CompactSpaceCompressor, CompactSpaceDecompressor};
use crate::column_values::monotonic_map_column;
use crate::column_values::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
};
use crate::column_values::compact_space::CompactSpaceCompressor;
use crate::column_values::U128FastFieldCodecType;
use crate::iterable::Iterable;
use crate::{ColumnValues, MonotonicallyMappableToU128};
use crate::MonotonicallyMappableToU128;
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct U128Header {
@@ -62,52 +55,6 @@ pub fn serialize_column_values_u128<T: MonotonicallyMappableToU128>(
Ok(())
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
pub(crate) enum U128FastFieldCodecType {
/// This codec takes a large number space (u128) and reduces it to a compact number space, by
/// removing the holes.
CompactSpace = 1,
}
impl BinarySerializable for U128FastFieldCodecType {
fn serialize<W: Write + ?Sized>(&self, wrt: &mut W) -> io::Result<()> {
self.to_code().serialize(wrt)
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let code = u8::deserialize(reader)?;
let codec_type: Self = Self::from_code(code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
Ok(codec_type)
}
}
impl U128FastFieldCodecType {
pub(crate) fn to_code(self) -> u8 {
self as u8
}
pub(crate) fn from_code(code: u8) -> Option<Self> {
match code {
1 => Some(Self::CompactSpace),
_ => None,
}
}
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
StrictlyMonotonicMappingToInternal::<T>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
}
#[cfg(test)]
pub mod tests {
use super::*;

View File

@@ -1,6 +1,4 @@
use std::io::{self, Write};
use std::num::NonZeroU64;
use std::ops::{Range, RangeInclusive};
use common::{BinarySerializable, OwnedBytes};
use fastdivide::DividerU64;
@@ -18,46 +16,6 @@ pub struct BitpackedReader {
stats: ColumnStats,
}
#[inline(always)]
const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
// copied from unstable rust standard library.
let d = n / q.get();
let r = n % q.get();
if r > 0 {
d + 1
} else {
d
}
}
// The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
// f is defined by:
// f: bitpacked -> stats.min_value + stats.gcd * bitpacked
//
// In order to run range queries, we invert the transformation.
// `transform_range_before_linear_transformation` returns the range of values
// [min_bipacked_value..max_bitpacked_value] such that
// f(bitpacked) ∈ [min_value, max_value] <=> bitpacked ∈ [min_bitpacked_value, max_bitpacked_value]
fn transform_range_before_linear_transformation(
stats: &ColumnStats,
range: RangeInclusive<u64>,
) -> Option<RangeInclusive<u64>> {
if range.is_empty() {
return None;
}
if stats.min_value > *range.end() {
return None;
}
if stats.max_value < *range.start() {
return None;
}
let shifted_range =
range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
let end_before_gcd_multiplication: u64 = *shifted_range.end() / stats.gcd;
Some(start_before_gcd_multiplication..=end_before_gcd_multiplication)
}
impl ColumnValues for BitpackedReader {
#[inline(always)]
fn get_val(&self, doc: u32) -> u64 {
@@ -76,26 +34,6 @@ impl ColumnValues for BitpackedReader {
fn num_vals(&self) -> RowId {
self.stats.num_rows
}
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<u64>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;
};
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
}
fn num_bits(stats: &ColumnStats) -> u8 {

View File

@@ -201,8 +201,8 @@ pub struct BlockwiseLinearReader {
impl ColumnValues for BlockwiseLinearReader {
#[inline(always)]
fn get_val(&self, idx: u32) -> u64 {
let block_id = (idx / BLOCK_SIZE) as usize;
let idx_within_block = idx % BLOCK_SIZE;
let block_id = (idx / BLOCK_SIZE as u32) as usize;
let idx_within_block = idx % (BLOCK_SIZE as u32);
let block = &self.blocks[block_id];
let interpoled_val: u64 = block.line.eval(idx_within_block);
let block_bytes = &self.data[block.data_start_offset..];

View File

@@ -27,7 +27,7 @@ pub struct StatsCollector {
// This is the same as computing the difference between the values and the first value.
//
// This way, we can compress i64-converted-to-u64 (e.g. timestamp that were supplied in
// seconds, only to be converted in nanoseconds).
// seconds, only to be converted in microseconds).
increment_gcd_opt: Option<(NonZeroU64, DividerU64)>,
first_value_opt: Option<u64>,
}

View File

@@ -19,62 +19,6 @@ fn test_serialize_and_load_simple() {
assert_eq!(col.get_val(1), 2);
assert_eq!(col.get_val(2), 5);
}
#[test]
fn test_empty_column_i64() {
let vals: [i64; 0] = [];
let mut num_acceptable_codecs = 0;
for codec in ALL_U64_CODEC_TYPES {
let mut buffer = Vec::new();
if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
continue;
}
num_acceptable_codecs += 1;
let col = load_u64_based_column_values::<i64>(OwnedBytes::new(buffer)).unwrap();
assert_eq!(col.num_vals(), 0);
assert_eq!(col.min_value(), i64::MIN);
assert_eq!(col.max_value(), i64::MIN);
}
assert!(num_acceptable_codecs > 0);
}
#[test]
fn test_empty_column_u64() {
let vals: [u64; 0] = [];
let mut num_acceptable_codecs = 0;
for codec in ALL_U64_CODEC_TYPES {
let mut buffer = Vec::new();
if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
continue;
}
num_acceptable_codecs += 1;
let col = load_u64_based_column_values::<u64>(OwnedBytes::new(buffer)).unwrap();
assert_eq!(col.num_vals(), 0);
assert_eq!(col.min_value(), u64::MIN);
assert_eq!(col.max_value(), u64::MIN);
}
assert!(num_acceptable_codecs > 0);
}
#[test]
fn test_empty_column_f64() {
let vals: [f64; 0] = [];
let mut num_acceptable_codecs = 0;
for codec in ALL_U64_CODEC_TYPES {
let mut buffer = Vec::new();
if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
continue;
}
num_acceptable_codecs += 1;
let col = load_u64_based_column_values::<f64>(OwnedBytes::new(buffer)).unwrap();
assert_eq!(col.num_vals(), 0);
// FIXME. f64::MIN would be better!
assert!(col.min_value().is_nan());
assert!(col.max_value().is_nan());
}
assert!(num_acceptable_codecs > 0);
}
pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
vals: &[u64],
name: &str,
@@ -99,28 +43,14 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
let reader = TColumnCodec::load(OwnedBytes::new(buffer)).unwrap();
assert_eq!(reader.num_vals(), vals.len() as u32);
let mut buffer = Vec::new();
for (doc, orig_val) in vals.iter().copied().enumerate() {
let val = reader.get_val(doc as u32);
assert_eq!(
val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data `{vals:?}`",
);
buffer.resize(1, 0);
reader.get_vals(&[doc as u32], &mut buffer);
let val = buffer[0];
assert_eq!(
val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data `{vals:?}`",
);
}
let all_docs: Vec<u32> = (0..vals.len() as u32).collect();
buffer.resize(all_docs.len(), 0);
reader.get_vals(&all_docs, &mut buffer);
assert_eq!(vals, buffer);
if !vals.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=vals.len() - 1);
let expected_positions: Vec<u32> = vals

View File

@@ -1,52 +0,0 @@
use std::fmt::Debug;
use tantivy_bitpacker::minmax;
use crate::ColumnValues;
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
pub(crate) values: &'a [T],
pub(crate) min_value: T,
pub(crate) max_value: T,
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.values.iter().copied())
}
fn min_value(&self) -> T {
self.min_value
}
fn max_value(&self) -> T {
self.max_value
}
fn num_vals(&self) -> u32 {
self.values.len() as u32
}
fn get_range(&self, start: u64, output: &mut [T]) {
output.copy_from_slice(&self.values[start as usize..][..output.len()])
}
}
impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
where V: AsRef<[T]> + ?Sized
{
fn from(values: &'a V) -> Self {
let values = values.as_ref();
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
Self {
values,
min_value,
max_value,
}
}
}

View File

@@ -1,15 +1,12 @@
use std::fmt;
use std::fmt::Debug;
use std::net::Ipv6Addr;
use serde::{Deserialize, Serialize};
use crate::value::NumericalType;
use crate::InvalidData;
/// The column type represents the column type.
/// Any changes need to be propagated to `COLUMN_TYPES`.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd, Serialize, Deserialize)]
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd)]
#[repr(u8)]
pub enum ColumnType {
I64 = 0u8,
@@ -22,22 +19,6 @@ pub enum ColumnType {
DateTime = 7u8,
}
impl fmt::Display for ColumnType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let short_str = match self {
ColumnType::I64 => "i64",
ColumnType::U64 => "u64",
ColumnType::F64 => "f64",
ColumnType::Bytes => "bytes",
ColumnType::Str => "str",
ColumnType::Bool => "bool",
ColumnType::IpAddr => "ip",
ColumnType::DateTime => "datetime",
};
write!(f, "{short_str}")
}
}
// The order needs to match _exactly_ the order in the enum
const COLUMN_TYPES: [ColumnType; 8] = [
ColumnType::I64,
@@ -54,9 +35,6 @@ impl ColumnType {
pub fn to_code(self) -> u8 {
self as u8
}
pub fn is_date_time(&self) -> bool {
self == &ColumnType::DateTime
}
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
@@ -165,7 +143,7 @@ mod tests {
}
}
for code in COLUMN_TYPES.len() as u8..=u8::MAX {
assert!(ColumnType::try_from_code(code).is_err());
assert!(ColumnType::try_from_code(code as u8).is_err());
}
}

View File

@@ -1,7 +1,7 @@
use std::io::{self, Write};
use common::{BitSet, CountingWriter, ReadOnlyBitSet};
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
use sstable::{SSTable, TermOrdinal};
use super::term_merger::TermMerger;
use crate::column::serialize_column_mappable_to_u64;
@@ -56,19 +56,17 @@ impl<'a> RemappedTermOrdinalsValues<'a> {
.bytes_columns
.iter()
.enumerate()
.flat_map(|(seg_ord, bytes_column_opt)| {
let bytes_column = bytes_column_opt.as_ref()?;
Some((seg_ord, bytes_column))
})
.flat_map(move |(seg_ord, bytes_column)| {
let term_ord_after_merge_mapping =
self.term_ord_mapping.get_segment(seg_ord as u32);
bytes_column
.ords()
.values
.iter()
.map(move |term_ord| term_ord_after_merge_mapping[term_ord as usize])
.flat_map(|(segment_ord, byte_column)| {
let segment_ord = self.term_ord_mapping.get_segment(segment_ord as u32);
byte_column.iter().flat_map(move |bytes_column| {
bytes_column
.ords()
.values
.iter()
.map(move |term_ord| segment_ord[term_ord as usize])
})
});
// TODO see if we can better decompose the mapping / and the stacking
Box::new(iter)
}
@@ -98,7 +96,7 @@ fn compute_term_bitset(column: &BytesColumn, row_bitset: &ReadOnlyBitSet) -> Bit
let num_terms = column.dictionary().num_terms();
let mut term_bitset = BitSet::with_max_value(num_terms as u32);
for row_id in row_bitset.iter() {
for term_ord in column.term_ord_column.values_for_doc(row_id) {
for term_ord in column.term_ord_column.values(row_id) {
term_bitset.insert(term_ord as u32);
}
}
@@ -126,20 +124,16 @@ fn serialize_merged_dict(
let mut term_ord_mapping = TermOrdinalMapping::default();
let mut field_term_streams = Vec::new();
for column_opt in bytes_columns.iter() {
if let Some(column) = column_opt {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
field_term_streams.push(terms);
} else {
term_ord_mapping.add_segment(0);
field_term_streams.push(Streamer::empty());
}
for column in bytes_columns.iter().flatten() {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms = column.dictionary.stream()?;
field_term_streams.push(terms);
}
let mut merged_terms = TermMerger::new(field_term_streams);
let mut sstable_builder = sstable::VoidSSTable::writer(output);
// TODO support complex `merge_row_order`.
match merge_row_order {
MergeRowOrder::Stack(_) => {
let mut current_term_ord = 0;
@@ -197,7 +191,7 @@ struct TermOrdinalMapping {
impl TermOrdinalMapping {
fn add_segment(&mut self, max_term_ord: usize) {
self.per_segment_new_term_ordinals
.push(vec![TermOrdinal::default(); max_term_ord]);
.push(vec![TermOrdinal::default(); max_term_ord as usize]);
}
fn register_from_to(&mut self, segment_ord: usize, from_ord: TermOrdinal, to_ord: TermOrdinal) {

View File

@@ -11,17 +11,6 @@ pub struct StackMergeOrder {
}
impl StackMergeOrder {
#[cfg(test)]
pub fn stack_for_test(num_rows_per_columnar: &[u32]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(num_rows_per_columnar.len());
let mut cumulated_row_id = 0;
for &num_rows in num_rows_per_columnar {
cumulated_row_id += num_rows;
cumulated_row_ids.push(cumulated_row_id);
}
StackMergeOrder { cumulated_row_ids }
}
pub fn stack(columnars: &[&ColumnarReader]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
let mut cumulated_row_id = 0;
@@ -52,8 +41,8 @@ pub enum MergeRowOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_readers has n_rows_i rows, then
/// in the resulting columnar,
/// rows [r0..n_row_0) contains the row of `columnar_readers[0]`, in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of `columnar_readers[1]`, in order.
/// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
/// ..
/// No documents is deleted.
Stack(StackMergeOrder),

View File

@@ -2,12 +2,13 @@ mod merge_dict_column;
mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
// mod sorted_doc_id_column;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer;
@@ -18,8 +19,7 @@ use crate::columnar::writer::CompatibleNumericalTypes;
use crate::columnar::ColumnarReader;
use crate::dynamic_column::DynamicColumn;
use crate::{
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
NumericalValue,
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, NumericalType, NumericalValue,
};
/// Column types are grouped into different categories.
@@ -29,16 +29,14 @@ use crate::{
/// In practise, today, only Numerical colummns are coerced into one type today.
///
/// See also [README.md].
///
/// The ordering has to match the ordering of the variants in [ColumnType].
#[derive(Copy, Clone, Eq, PartialOrd, Ord, PartialEq, Hash, Debug)]
pub(crate) enum ColumnTypeCategory {
Numerical,
Bytes,
Str,
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
enum ColumnTypeCategory {
Bool,
IpAddr,
Str,
Numerical,
DateTime,
Bytes,
IpAddr,
}
impl From<ColumnType> for ColumnTypeCategory {
@@ -56,63 +54,26 @@ impl From<ColumnType> for ColumnTypeCategory {
}
}
/// Merge several columnar table together.
///
/// If several columns with the same name are conflicting with the numerical types in the
/// input columnars, the first type compatible out of i64, u64, f64 in that order will be used.
///
/// `require_columns` makes it possible to ensure that some columns will be present in the
/// resulting columnar. When a required column is a numerical column type, one of two things can
/// happen:
/// - If the required column type is compatible with all of the input columnar, the resulsting
/// merged
/// columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged
/// will fail with an InvalidData error.
///
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
/// `Columnar` table.
///
/// Reminder: a string and a numerical column may bare the same column name. This is not
/// considered a conflict.
pub fn merge_columnar(
columnar_readers: &[&ColumnarReader],
required_columns: &[(String, ColumnType)],
merge_row_order: MergeRowOrder,
output: &mut impl io::Write,
) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(output);
let num_rows_per_columnar = columnar_readers
.iter()
.map(|reader| reader.num_rows())
.collect::<Vec<u32>>();
let columns_to_merge =
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
for res in columns_to_merge {
let ((column_name, _column_type_category), grouped_columns) = res;
let grouped_columns = grouped_columns.open(&merge_row_order)?;
if grouped_columns.is_empty() {
continue;
}
let column_type = grouped_columns.column_type_after_merge();
let mut columns = grouped_columns.columns;
coerce_columns(column_type, &mut columns)?;
let columns_to_merge = group_columns_for_merge(columnar_readers)?;
for ((column_name, column_type), columns) in columns_to_merge {
let mut column_serializer =
serializer.start_serialize_column(column_name.as_bytes(), column_type);
serializer.serialize_column(column_name.as_bytes(), column_type);
merge_column(
column_type,
&num_rows_per_columnar,
columns,
&merge_row_order,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
serializer.finalize(merge_row_order.num_rows())?;
Ok(())
}
@@ -129,7 +90,6 @@ fn dynamic_column_to_u64_monotonic(dynamic_column: DynamicColumn) -> Option<Colu
fn merge_column(
column_type: ColumnType,
num_docs_per_column: &[u32],
columns: Vec<Option<DynamicColumn>>,
merge_row_order: &MergeRowOrder,
wrt: &mut impl io::Write,
@@ -140,19 +100,17 @@ fn merge_column(
| ColumnType::F64
| ColumnType::DateTime
| ColumnType::Bool => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut column_indexes: Vec<Option<ColumnIndex>> = Vec::with_capacity(columns.len());
let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
Vec::with_capacity(columns.len());
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
if let Some(Column { index: idx, values }) =
for dynamic_column_opt in columns {
if let Some(Column { idx, values }) =
dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
{
column_indexes.push(idx);
column_indexes.push(Some(idx));
column_values.push(Some(values));
} else {
column_indexes.push(ColumnIndex::Empty {
num_docs: num_docs_per_column[i],
});
column_indexes.push(None);
column_values.push(None);
}
}
@@ -166,19 +124,15 @@ fn merge_column(
serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?;
}
ColumnType::IpAddr => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut column_indexes: Vec<Option<ColumnIndex>> = Vec::with_capacity(columns.len());
let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> =
Vec::with_capacity(columns.len());
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
if let Some(DynamicColumn::IpAddr(Column { index: idx, values })) =
dynamic_column_opt
{
column_indexes.push(idx);
for dynamic_column_opt in columns {
if let Some(DynamicColumn::IpAddr(Column { idx, values })) = dynamic_column_opt {
column_indexes.push(Some(idx));
column_values.push(Some(values));
} else {
column_indexes.push(ColumnIndex::Empty {
num_docs: num_docs_per_column[i],
});
column_indexes.push(None);
column_values.push(None);
}
}
@@ -194,22 +148,20 @@ fn merge_column(
serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?;
}
ColumnType::Bytes | ColumnType::Str => {
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
let mut column_indexes: Vec<Option<ColumnIndex>> = Vec::with_capacity(columns.len());
let mut bytes_columns: Vec<Option<BytesColumn>> = Vec::with_capacity(columns.len());
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
for dynamic_column_opt in columns {
match dynamic_column_opt {
Some(DynamicColumn::Str(str_column)) => {
column_indexes.push(str_column.term_ord_column.index.clone());
column_indexes.push(Some(str_column.term_ord_column.idx.clone()));
bytes_columns.push(Some(str_column.into()));
}
Some(DynamicColumn::Bytes(bytes_column)) => {
column_indexes.push(bytes_column.term_ord_column.index.clone());
column_indexes.push(Some(bytes_column.term_ord_column.idx.clone()));
bytes_columns.push(Some(bytes_column));
}
_ => {
column_indexes.push(ColumnIndex::Empty {
num_docs: num_docs_per_column[i],
});
column_indexes.push(None);
bytes_columns.push(None);
}
}
@@ -222,264 +174,98 @@ fn merge_column(
Ok(())
}
struct GroupedColumns {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumn>>,
}
#[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {
// Each column name may have multiple types of column associated.
// For merging we are interested in the same column type category since they can be merged.
let mut columns_grouped: HashMap<(String, ColumnTypeCategory), Vec<Option<DynamicColumn>>> =
HashMap::new();
impl GroupedColumns {
/// Check is column group can be skipped during serialization.
fn is_empty(&self) -> bool {
self.required_column_type.is_none() && self.columns.iter().all(Option::is_none)
let num_columnars = columnar_readers.len();
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.list_columns()?;
for (column_name, handle) in column_name_and_handle {
let column_type_category: ColumnTypeCategory = handle.column_type().into();
let columns = columns_grouped
.entry((column_name, column_type_category))
.or_insert_with(|| vec![None; num_columnars]);
let column = handle.open()?;
columns[columnar_id] = Some(column);
}
}
/// Returns the column type after merge.
///
/// This method does not check if the column types can actually be coerced to
/// this type.
fn column_type_after_merge(&self) -> ColumnType {
if let Some(required_type) = self.required_column_type {
return required_type;
let mut merge_columns: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
BTreeMap::default();
for ((column_name, col_category), mut columns) in columns_grouped {
if col_category == ColumnTypeCategory::Numerical {
coerce_numerical_columns_to_same_type(&mut columns);
}
let column_type: HashSet<ColumnType> = self
.columns
let column_type = columns
.iter()
.flatten()
.map(|column| column.column_type())
.collect();
if column_type.len() == 1 {
return column_type.into_iter().next().unwrap();
}
// At the moment, only the numerical categorical column type has more than one possible
// column type.
assert!(self
.columns
.iter()
.flatten()
.all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
merged_numerical_columns_type(self.columns.iter().flatten()).into()
.map(|col| col.column_type())
.next()
.unwrap();
merge_columns.insert((column_name, column_type), columns);
}
Ok(merge_columns)
}
struct GroupedColumnsHandle {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumnHandle>>,
}
impl GroupedColumnsHandle {
fn new(num_columnars: usize) -> Self {
GroupedColumnsHandle {
required_column_type: None,
columns: vec![None; num_columnars],
}
}
fn open(self, merge_row_order: &MergeRowOrder) -> io::Result<GroupedColumns> {
let mut columns: Vec<Option<DynamicColumn>> = Vec::new();
for (columnar_id, column) in self.columns.iter().enumerate() {
if let Some(column) = column {
let column = column.open()?;
// We skip columns that end up with 0 documents.
// That way, we make sure they don't end up influencing the merge type or
// creating empty columns.
if is_empty_after_merge(merge_row_order, &column, columnar_id) {
columns.push(None);
} else {
columns.push(Some(column));
}
} else {
columns.push(None);
}
}
Ok(GroupedColumns {
required_column_type: self.required_column_type,
columns,
})
}
/// Set the dynamic column for a given columnar.
fn set_column(&mut self, columnar_id: usize, column: DynamicColumnHandle) {
self.columns[columnar_id] = Some(column);
}
/// Force the existence of a column, as well as its type.
fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
if let Some(existing_required_type) = self.required_column_type {
if existing_required_type == required_type {
// This was just a duplicate in the `required_columns`.
// Nothing to do.
return Ok(());
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Required column conflicts with another required column of the same type \
category.",
));
}
}
self.required_column_type = Some(required_type);
Ok(())
}
}
/// Returns the type of the merged numerical column.
/// Coerce a set of numerical columns to the same type.
///
/// This function picks the first numerical type out of i64, u64, f64 (order matters
/// here), that is compatible with all the `columns`.
///
/// # Panics
/// Panics if one of the column is not numerical.
fn merged_numerical_columns_type<'a>(
columns: impl Iterator<Item = &'a DynamicColumn>,
) -> NumericalType {
/// If all columns are already from the same type, keep this type
/// (even if they could all be coerced to i64).
fn coerce_numerical_columns_to_same_type(columns: &mut [Option<DynamicColumn>]) {
let mut column_types: HashSet<NumericalType> = HashSet::default();
let mut compatible_numerical_types = CompatibleNumericalTypes::default();
for column in columns {
let (min_value, max_value) =
min_max_if_numerical(column).expect("All columns re required to be numerical");
for column in columns.iter().flatten() {
let min_value: NumericalValue;
let max_value: NumericalValue;
match column {
DynamicColumn::I64(column) => {
min_value = column.min_value().into();
max_value = column.max_value().into();
}
DynamicColumn::U64(column) => {
min_value = column.min_value().into();
max_value = column.min_value().into();
}
DynamicColumn::F64(column) => {
min_value = column.min_value().into();
max_value = column.min_value().into();
}
DynamicColumn::Bool(_)
| DynamicColumn::IpAddr(_)
| DynamicColumn::DateTime(_)
| DynamicColumn::Bytes(_)
| DynamicColumn::Str(_) => {
panic!("We expected only numerical columns.");
}
}
column_types.insert(column.column_type().numerical_type().unwrap());
compatible_numerical_types.accept_value(min_value);
compatible_numerical_types.accept_value(max_value);
}
compatible_numerical_types.to_numerical_type()
}
if column_types.len() <= 1 {
// No need to do anything. The columns are already all from the same type.
// This is necessary to let use force a given type.
fn is_empty_after_merge(
merge_row_order: &MergeRowOrder,
column: &DynamicColumn,
columnar_ord: usize,
) -> bool {
if column.num_values() == 0u32 {
// It was empty before the merge.
return true;
// TODO This works in a world where we do not allow a change of schema,
// but in the future, we will have to pass some kind of schema to enforce
// the logic.
return;
}
match merge_row_order {
MergeRowOrder::Stack(_) => {
// If we are stacking the columnar, no rows are being deleted.
false
}
MergeRowOrder::Shuffled(shuffled) => {
if let Some(alive_bitset) = &shuffled.alive_bitsets[columnar_ord] {
let column_index = column.column_index();
match column_index {
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_rows() {
if alive_bitset.contains(doc) {
return false;
}
}
true
}
ColumnIndex::Multivalued(multivalued_index) => {
for (doc_id, (start_index, end_index)) in multivalued_index
.start_index_column
.iter()
.tuple_windows()
.enumerate()
{
let doc_id = doc_id as u32;
if start_index == end_index {
// There are no values in this document
continue;
}
// The document contains values and is present in the alive bitset.
// The column is therefore not empty.
if alive_bitset.contains(doc_id) {
return false;
}
}
true
}
}
} else {
// No document is being deleted.
// The shuffle is applying a permutation.
false
}
}
}
}
/// Iterates over the columns of the columnar readers, grouped by column name.
/// Key functionality is that `open` of the Columns is done lazy per group.
fn group_columns_for_merge<'a>(
columnar_readers: &'a [&'a ColumnarReader],
required_columns: &'a [(String, ColumnType)],
_merge_row_order: &'a MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
for &(ref column_name, column_type) in required_columns {
columns
.entry((column_name.clone(), column_type.into()))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.require_type(column_type)?;
}
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.iter_columns()?;
for (column_name, handle) in column_name_and_handle {
let column_category: ColumnTypeCategory = handle.column_type().into();
columns
.entry((column_name, column_category))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.set_column(columnar_id, handle);
}
}
Ok(columns)
}
fn coerce_columns(
column_type: ColumnType,
columns: &mut [Option<DynamicColumn>],
) -> io::Result<()> {
let coerce_type = compatible_numerical_types.to_numerical_type();
for column_opt in columns.iter_mut() {
if let Some(column) = column_opt.take() {
*column_opt = Some(coerce_column(column_type, column)?);
*column_opt = column.coerce_numerical(coerce_type);
}
}
Ok(())
}
fn coerce_column(column_type: ColumnType, column: DynamicColumn) -> io::Result<DynamicColumn> {
if let Some(numerical_type) = column_type.numerical_type() {
column
.coerce_numerical(numerical_type)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, ""))
} else {
if column.column_type() != column_type {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!(
"Cannot coerce column of type `{:?}` to `{column_type:?}`",
column.column_type()
),
));
}
Ok(column)
}
}
/// Returns the (min, max) of a column provided it is numerical (i64, u64. f64).
///
/// The min and the max are simply the numerical value as defined by `ColumnValue::min_value()`,
/// and `ColumnValue::max_value()`.
///
/// It is important to note that these values are only guaranteed to be lower/upper bound
/// (as opposed to min/max value).
/// If a column is empty, the min and max values are currently set to 0.
fn min_max_if_numerical(column: &DynamicColumn) -> Option<(NumericalValue, NumericalValue)> {
match column {
DynamicColumn::I64(column) => Some((column.min_value().into(), column.max_value().into())),
DynamicColumn::U64(column) => Some((column.min_value().into(), column.max_value().into())),
DynamicColumn::F64(column) => Some((column.min_value().into(), column.max_value().into())),
DynamicColumn::Bool(_)
| DynamicColumn::IpAddr(_)
| DynamicColumn::DateTime(_)
| DynamicColumn::Bytes(_)
| DynamicColumn::Str(_) => None,
}
}
#[cfg(test)]

View File

@@ -0,0 +1,107 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use itertools::Itertools;
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::SegmentReader;
pub(crate) struct RemappedDocIdColumn<'a> {
doc_id_mapping: &'a SegmentDocIdMapping,
fast_field_readers: Vec<Arc<dyn Column<u64>>>,
min_value: u64,
max_value: u64,
num_vals: u32,
}
fn compute_min_max_val(
u64_reader: &dyn Column<u64>,
segment_reader: &SegmentReader,
) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 {
return None;
}
if segment_reader.alive_bitset().is_none() {
// no deleted documents,
// we can use the previous min_val, max_val.
return Some((u64_reader.min_value(), u64_reader.max_value()));
}
// some deleted documents,
// we need to recompute the max / min
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id))
.minmax()
.into_option()
}
impl<'a> RemappedDocIdColumn<'a> {
pub(crate) fn new(
readers: &'a [SegmentReader],
doc_id_mapping: &'a SegmentDocIdMapping,
field: &str,
) -> Self {
let (min_value, max_value) = readers
.iter()
.filter_map(|reader| {
let u64_reader: Arc<dyn Column<u64>> =
reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
);
compute_min_max_val(&*u64_reader, reader)
})
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
.expect("Unexpected error, empty readers in IndexMerger");
let fast_field_readers = readers
.iter()
.map(|reader| {
let u64_reader: Arc<dyn Column<u64>> =
reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
);
u64_reader
})
.collect::<Vec<_>>();
RemappedDocIdColumn {
doc_id_mapping,
fast_field_readers,
min_value,
max_value,
num_vals: doc_id_mapping.len() as u32,
}
}
}
impl<'a> Column for RemappedDocIdColumn<'a> {
fn get_val(&self, _doc: u32) -> u64 {
unimplemented!()
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.doc_id_mapping
.iter_old_doc_addrs()
.map(|old_doc_addr| {
let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get_val(old_doc_addr.doc_id)
}),
)
}
fn min_value(&self) -> u64 {
self.min_value
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u32 {
self.num_vals
}
}

View File

@@ -1,7 +1,3 @@
use std::collections::BTreeMap;
use itertools::Itertools;
use super::*;
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
@@ -27,118 +23,51 @@ fn test_column_coercion_to_u64() {
let columnar1 = make_columnar("numbers", &[1i64]);
// u64 type
let columnar2 = make_columnar("numbers", &[u64::MAX]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_column_no_coercion_if_all_the_same() {
let columnar1 = make_columnar("numbers", &[1u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_column_coercion_to_i64() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
}
//#[test]
// fn test_impossible_coercion_returns_an_error() {
// let columnar1 = make_columnar("numbers", &[u64::MAX]);
// let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
// let group_error = group_columns_for_merge_iter(
//&[&columnar1],
//&[("numbers".to_string(), ColumnType::I64)],
//&merge_order,
//)
//.unwrap_err();
// assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
//}
#[test]
fn test_group_columns_with_required_column() {
let columnar1 = make_columnar("numbers", &[1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
}
#[test]
fn test_group_columns_required_column_with_no_existing_columns() {
let columnar1 = make_columnar("numbers", &[2u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<_, _> = group_columns_for_merge(
columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 2);
let columns = &column_map
.get(&("required_col".to_string(), ColumnTypeCategory::Str))
.unwrap()
.columns;
assert_eq!(columns.len(), 2);
assert!(columns[0].is_none());
assert!(columns[1].is_none());
}
#[test]
fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_rule() {
let columnar1 = make_columnar("numbers", &[2i64]);
let columnar2 = make_columnar("numbers", &[2i64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(
columnars,
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
}
#[test]
fn test_missing_column() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers2", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2]).unwrap();
assert_eq!(column_map.len(), 2);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
{
let columns = &column_map
.get(&("numbers".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers".to_string(), ColumnType::I64))
.unwrap();
assert!(columns[0].is_some());
assert!(columns[1].is_none());
}
{
let columns = &column_map
.get(&("numbers2".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers2".to_string(), ColumnType::U64))
.unwrap();
assert!(columns[0].is_none());
assert!(columns[1].is_some());
}
@@ -167,24 +96,20 @@ fn make_numerical_columnar_multiple_columns(
ColumnarReader::open(buffer).unwrap()
}
#[track_caller]
fn make_byte_columnar_multiple_columns(
columns: &[(&str, &[&[&[u8]]])],
num_rows: u32,
) -> ColumnarReader {
fn make_byte_columnar_multiple_columns(columns: &[(&str, &[&[&[u8]]])]) -> ColumnarReader {
let mut dataframe_writer = ColumnarWriter::default();
for (column_name, column_values) in columns {
assert_eq!(
column_values.len(),
num_rows as usize,
"All columns must have `{num_rows}` rows"
);
for (row_id, vals) in column_values.iter().enumerate() {
for val in vals.iter() {
dataframe_writer.record_bytes(row_id as u32, column_name, val);
dataframe_writer.record_bytes(row_id as u32, column_name, *val);
}
}
}
let num_rows = columns
.iter()
.map(|(_, val_rows)| val_rows.len() as RowId)
.max()
.unwrap_or(0u32);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
@@ -197,7 +122,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
for (column_name, column_values) in columns {
for (row_id, vals) in column_values.iter().enumerate() {
for val in vals.iter() {
dataframe_writer.record_str(row_id as u32, column_name, val);
dataframe_writer.record_str(row_id as u32, column_name, *val);
}
}
}
@@ -226,7 +151,6 @@ fn test_merge_columnar_numbers() {
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
@@ -236,9 +160,7 @@ fn test_merge_columnar_numbers() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::F64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.first(0u32), Some(-1f64));
assert_eq!(vals.first(1u32), None);
@@ -254,7 +176,6 @@ fn test_merge_columnar_texts() {
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
@@ -264,11 +185,7 @@ fn test_merge_columnar_texts() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
let get_str_for_ord = |ord| {
let mut out = String::new();
vals.ord_to_str(ord, &mut out).unwrap();
@@ -296,14 +213,13 @@ fn test_merge_columnar_texts() {
#[test]
fn test_merge_columnar_byte() {
let columnar1 = make_byte_columnar_multiple_columns(&[("bytes", &[&[b"bbbb"], &[b"baaa"]])], 2);
let columnar2 = make_byte_columnar_multiple_columns(&[("bytes", &[&[], &[b"a"]])], 2);
let columnar1 = make_byte_columnar_multiple_columns(&[("bytes", &[&[b"bbbb"], &[b"baaa"]])]);
let columnar2 = make_byte_columnar_multiple_columns(&[("bytes", &[&[], &[b"a"]])]);
let mut buffer = Vec::new();
let columnars = &[&columnar1, &columnar2];
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
@@ -313,9 +229,7 @@ fn test_merge_columnar_byte() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -342,155 +256,3 @@ fn test_merge_columnar_byte() {
assert_eq!(get_bytes_for_row(2), b"");
assert_eq!(get_bytes_for_row(3), b"a");
}
#[test]
fn test_merge_columnar_byte_with_missing() {
let columnar1 = make_byte_columnar_multiple_columns(&[], 3);
let columnar2 = make_byte_columnar_multiple_columns(&[("col", &[&[b"b"], &[]])], 2);
let columnar3 = make_byte_columnar_multiple_columns(
&[
("col", &[&[], &[b"b"], &[b"a", b"b"]]),
("col2", &[&[b"hello"], &[], &[b"a", b"b"]]),
],
3,
);
let mut buffer = Vec::new();
let columnars = &[&columnar1, &columnar2, &columnar3];
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
.unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_rows(), 3 + 2 + 3);
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
out
};
assert_eq!(vals.dictionary.num_terms(), 2);
assert_eq!(get_bytes_for_ord(0), b"a");
assert_eq!(get_bytes_for_ord(1), b"b");
let get_bytes_for_row = |row_id| {
let terms: Vec<Vec<u8>> = vals
.term_ords(row_id)
.map(|term_ord| {
let mut out = Vec::new();
vals.ord_to_bytes(term_ord, &mut out).unwrap();
out
})
.collect();
terms
};
assert!(get_bytes_for_row(0).is_empty());
assert!(get_bytes_for_row(1).is_empty());
assert!(get_bytes_for_row(2).is_empty());
assert_eq!(get_bytes_for_row(3), vec![b"b".to_vec()]);
assert!(get_bytes_for_row(4).is_empty());
assert!(get_bytes_for_row(5).is_empty());
assert_eq!(get_bytes_for_row(6), vec![b"b".to_vec()]);
assert_eq!(get_bytes_for_row(7), vec![b"a".to_vec(), b"b".to_vec()]);
}
#[test]
fn test_merge_columnar_different_types() {
let columnar1 = make_text_columnar_multiple_columns(&[("mixed", &[&["a"]])]);
let columnar2 = make_text_columnar_multiple_columns(&[("mixed", &[&[], &["b"]])]);
let columnar3 = make_columnar("mixed", &[1i64]);
let mut buffer = Vec::new();
let columnars = &[&columnar1, &columnar2, &columnar3];
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
.unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_rows(), 4);
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("mixed").unwrap();
// numeric column
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::I64(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(2).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(3).collect_vec(), vec![1]);
assert_eq!(vals.values_for_doc(4).collect_vec(), vec![]);
// text column
let dynamic_column = cols[1].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
let mut out = String::new();
vals.ord_to_str(ord, &mut out).unwrap();
out
};
assert_eq!(vals.dictionary.num_terms(), 2);
assert_eq!(get_str_for_ord(0), "a");
assert_eq!(get_str_for_ord(1), "b");
let get_str_for_row = |row_id| {
let term_ords: Vec<String> = vals
.term_ords(row_id)
.map(|el| {
let mut out = String::new();
vals.ord_to_str(el, &mut out).unwrap();
out
})
.collect();
term_ords
};
assert_eq!(get_str_for_row(0), vec!["a".to_string()]);
assert_eq!(get_str_for_row(1), Vec::<String>::new());
assert_eq!(get_str_for_row(2), vec!["b".to_string()]);
assert_eq!(get_str_for_row(3), Vec::<String>::new());
}
#[test]
fn test_merge_columnar_different_empty_cardinality() {
let columnar1 = make_text_columnar_multiple_columns(&[("mixed", &[&["a"]])]);
let columnar2 = make_columnar("mixed", &[1i64]);
let mut buffer = Vec::new();
let columnars = &[&columnar1, &columnar2];
let stack_merge_order = StackMergeOrder::stack(columnars);
crate::columnar::merge_columnar(
columnars,
&[],
MergeRowOrder::Stack(stack_merge_order),
&mut buffer,
)
.unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_rows(), 2);
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("mixed").unwrap();
// numeric column
let dynamic_column = cols[0].open().unwrap();
assert_eq!(dynamic_column.get_cardinality(), Cardinality::Optional);
// text column
let dynamic_column = cols[1].open().unwrap();
assert_eq!(dynamic_column.get_cardinality(), Cardinality::Optional);
}

View File

@@ -0,0 +1 @@

View File

@@ -1,12 +1,11 @@
mod column_type;
mod format_version;
mod merge;
mod merge_index;
mod reader;
mod writer;
pub use column_type::{ColumnType, HasAssociatedColumnType};
#[cfg(test)]
pub(crate) use merge::ColumnTypeCategory;
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
pub use reader::ColumnarReader;
pub use writer::ColumnarWriter;

View File

@@ -1,4 +1,4 @@
use std::{fmt, io, mem};
use std::{io, mem};
use common::file_slice::FileSlice;
use common::BinarySerializable;
@@ -21,58 +21,6 @@ pub struct ColumnarReader {
num_rows: RowId,
}
impl fmt::Debug for ColumnarReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let num_rows = self.num_rows();
let columns = self.list_columns().unwrap();
let num_cols = columns.len();
let mut debug_struct = f.debug_struct("Columnar");
debug_struct
.field("num_rows", &num_rows)
.field("num_cols", &num_cols);
for (col_name, dynamic_column_handle) in columns.into_iter().take(5) {
let col = dynamic_column_handle.open().unwrap();
if col.num_values() > 10 {
debug_struct.field(&col_name, &"..");
} else {
debug_struct.field(&col_name, &col);
}
}
if num_cols > 5 {
debug_struct.finish_non_exhaustive()?;
} else {
debug_struct.finish()?;
}
Ok(())
}
}
/// Functions by both the async/sync code listing columns.
/// It takes a stream from the column sstable and return the list of
/// `DynamicColumn` available in it.
fn read_all_columns_in_stream(
mut stream: sstable::Streamer<'_, RangeSSTable>,
column_data: &FileSlice,
) -> io::Result<Vec<DynamicColumnHandle>> {
let mut results = Vec::new();
while stream.advance() {
let key_bytes: &[u8] = stream.key();
let Some(column_code) = key_bytes.last().copied() else {
return Err(io_invalid_data("Empty column name.".to_string()));
};
let column_type = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value();
let file_slice = column_data.slice(range.start as usize..range.end as usize);
let dynamic_column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
results.push(dynamic_column_handle);
}
Ok(results)
}
impl ColumnarReader {
/// Opens a new Columnar file.
pub fn open<F>(file_slice: F) -> io::Result<ColumnarReader>
@@ -102,44 +50,37 @@ impl ColumnarReader {
pub fn num_rows(&self) -> RowId {
self.num_rows
}
// Iterate over the columns in a sorted way
pub fn iter_columns(
&self,
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
let mut stream = self.column_dictionary.stream()?;
Ok(std::iter::from_fn(move || {
if stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
// TODO Error Handling. The API gets quite ugly when returning the error here, so
// instead we could just check the first N columns upfront.
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
.unwrap();
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
Some((column_name, column_handle))
} else {
None
}
}))
}
// TODO Add unit tests
pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
Ok(self.iter_columns()?.collect())
let mut stream = self.column_dictionary.stream()?;
let mut results = Vec::new();
while stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
results.push((column_name, column_handle));
}
Ok(results)
}
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
/// Get all columns for the given column name.
///
/// There can be more than one column associated to a given column name, provided they have
/// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
// Each column is a associated to a given `column_key`,
// that starts by `column_name\0column_header`.
//
@@ -148,35 +89,36 @@ impl ColumnarReader {
//
// This is in turn equivalent to searching for the range
// `[column_name,\0`..column_name\1)`.
// TODO can we get some more generic `prefix(..)` logic in the dictionary.
// TODO can we get some more generic `prefix(..)` logic in the dictioanry.
let mut start_key = column_name.to_string();
start_key.push('\0');
let mut end_key = column_name.to_string();
end_key.push(1u8 as char);
self.column_dictionary
let mut stream = self
.column_dictionary
.range()
.ge(start_key.as_bytes())
.lt(end_key.as_bytes())
}
pub async fn read_columns_async(
&self,
column_name: &str,
) -> io::Result<Vec<DynamicColumnHandle>> {
let stream = self
.stream_for_column_range(column_name)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data)
}
/// Get all columns for the given column name.
///
/// There can be more than one column associated to a given column name, provided they have
/// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let stream = self.stream_for_column_range(column_name).into_stream()?;
read_all_columns_in_stream(stream, &self.column_data)
.into_stream()?;
let mut results = Vec::new();
while stream.advance() {
let key_bytes: &[u8] = stream.key();
assert!(key_bytes.starts_with(start_key.as_bytes()));
let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let dynamic_column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
results.push(dynamic_column_handle);
}
Ok(results)
}
/// Return the number of columns in the columnar.

View File

@@ -310,7 +310,7 @@ mod tests {
buffer.extend_from_slice(b"234234");
let mut bytes = &buffer[..];
let serdeser_symbol = ColumnOperation::deserialize(&mut bytes).unwrap();
assert_eq!(bytes.len() + buf.as_ref().len(), buffer.len());
assert_eq!(bytes.len() + buf.as_ref().len() as usize, buffer.len());
assert_eq!(column_op, serdeser_symbol);
}
@@ -341,7 +341,7 @@ mod tests {
fn test_column_operation_unordered_aux(val: u32, expected_len: usize) {
let column_op = ColumnOperation::Value(UnorderedId(val));
let minibuf = column_op.serialize();
assert_eq!({ minibuf.as_ref().len() }, expected_len);
assert_eq!(minibuf.as_ref().len() as usize, expected_len);
let mut buf = minibuf.as_ref().to_vec();
buf.extend_from_slice(&[2, 2, 2, 2, 2, 2]);
let mut cursor = &buf[..];

View File

@@ -79,6 +79,7 @@ fn mutate_or_create_column<V, TMutator>(
impl ColumnarWriter {
pub fn mem_usage(&self) -> usize {
// TODO add dictionary builders.
self.arena.mem_usage()
+ self.numerical_field_hash_map.mem_usage()
+ self.bool_field_hash_map.mem_usage()
@@ -86,11 +87,6 @@ impl ColumnarWriter {
+ self.str_field_hash_map.mem_usage()
+ self.ip_addr_field_hash_map.mem_usage()
+ self.datetime_field_hash_map.mem_usage()
+ self
.dictionaries
.iter()
.map(|dict| dict.mem_usage())
.sum::<usize>()
}
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
@@ -102,37 +98,22 @@ impl ColumnarWriter {
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
.or_else(|| {
self.datetime_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
})
else {
return Vec::new();
let Some(numerical_col_writer) =
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();
let mut start_doc_check_fill = 0;
let mut current_doc_opt: Option<RowId> = None;
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
// calls
let mut last_doc_opt: Option<RowId> = None;
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
match op {
ColumnOperation::NewDoc(doc) => {
current_doc_opt = Some(doc);
last_doc_opt = Some(doc);
}
ColumnOperation::Value(numerical_value) => {
if let Some(current_doc) = current_doc_opt {
// Fill up with 0.0 since last doc
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
start_doc_check_fill = current_doc + 1;
// handle multi values
current_doc_opt = None;
if let Some(last_doc) = last_doc_opt {
let score: f32 = f64::coerce(numerical_value) as f32;
values.push((score, current_doc));
values.push((score, last_doc));
}
}
}
@@ -142,9 +123,9 @@ impl ColumnarWriter {
}
values.sort_by(|(left_score, _), (right_score, _)| {
if reversed {
right_score.total_cmp(left_score)
right_score.partial_cmp(left_score).unwrap()
} else {
left_score.total_cmp(right_score)
left_score.partial_cmp(right_score).unwrap()
}
});
values.into_iter().map(|(_score, doc)| doc).collect()
@@ -276,7 +257,7 @@ impl ColumnarWriter {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
NumericalValue::I64(datetime.into_timestamp_micros()),
arena,
);
column
@@ -338,7 +319,7 @@ impl ColumnarWriter {
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
.iter()
.map(|(column_name, addr)| {
.map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into();
@@ -348,27 +329,27 @@ impl ColumnarWriter {
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
.map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Str, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
);
columns.extend(
self.bool_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Bool, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
);
columns.extend(
self.ip_addr_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::IpAddr, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
);
columns.extend(
self.datetime_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
@@ -380,7 +361,7 @@ impl ColumnarWriter {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
serialize_bool_column(
cardinality,
num_docs,
@@ -392,13 +373,12 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::IpAddr => {
let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, ColumnType::IpAddr);
serializer.serialize_column(column_name, ColumnType::IpAddr);
serialize_ip_addr_column(
cardinality,
num_docs,
@@ -410,7 +390,6 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::Bytes | ColumnType::Str => {
let str_or_bytes_column_writer: StrOrBytesColumnWriter =
@@ -425,7 +404,7 @@ impl ColumnarWriter {
.column_writer
.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
serialize_bytes_or_str_column(
cardinality,
num_docs,
@@ -439,14 +418,13 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::F64 | ColumnType::I64 | ColumnType::U64 => {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let cardinality = numerical_column_writer.cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
let numerical_type = column_type.numerical_type().unwrap();
serialize_numerical_column(
cardinality,
@@ -460,13 +438,12 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::DateTime => {
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, ColumnType::DateTime);
serializer.serialize_column(column_name, ColumnType::DateTime);
serialize_numerical_column(
cardinality,
num_docs,
@@ -479,7 +456,6 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
};
}
@@ -785,7 +761,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&mut arena, None, &mut buffer)
.collect();
assert_eq!(symbols.len(), 6);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -814,7 +790,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&mut arena, None, &mut buffer)
.collect();
assert_eq!(symbols.len(), 4);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
@@ -837,7 +813,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&mut arena, None, &mut buffer)
.collect();
assert_eq!(symbols.len(), 2);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -856,7 +832,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&mut arena, None, &mut buffer)
.collect();
assert_eq!(symbols.len(), 3);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));

View File

@@ -34,12 +34,11 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
/// Creates a ColumnSerializer.
pub fn start_serialize_column<'a>(
pub fn serialize_column<'a>(
&'a mut self,
column_name: &[u8],
column_type: ColumnType,
) -> ColumnSerializer<'a, W> {
) -> impl io::Write + 'a {
let start_offset = self.wrt.written_bytes();
prepare_key(column_name, column_type, &mut self.prepare_key_buffer);
ColumnSerializer {
@@ -61,21 +60,20 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
pub struct ColumnSerializer<'a, W: io::Write> {
struct ColumnSerializer<'a, W: io::Write> {
columnar_serializer: &'a mut ColumnarSerializer<W>,
start_offset: u64,
}
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
pub fn finalize(self) -> io::Result<()> {
impl<'a, W: io::Write> Drop for ColumnSerializer<'a, W> {
fn drop(&mut self) {
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
let byte_range = self.start_offset..end_offset;
self.columnar_serializer.sstable_range.insert(
self.columnar_serializer.sstable_range.insert_cannot_fail(
&self.columnar_serializer.prepare_key_buffer[..],
&byte_range,
)?;
);
self.columnar_serializer.prepare_key_buffer.clear();
Ok(())
}
}

View File

@@ -150,7 +150,11 @@ mod tests {
multivalued_value_index_builder.record_row(2u32);
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
multivalued_value_index_builder
.finish(4u32)
.iter()
.copied()
.collect::<Vec<u32>>(),
vec![0, 0, 2, 3, 3]
);
multivalued_value_index_builder.reset();
@@ -158,7 +162,11 @@ mod tests {
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
multivalued_value_index_builder
.finish(4u32)
.iter()
.copied()
.collect::<Vec<u32>>(),
vec![0, 0, 0, 2, 2]
);
}

View File

@@ -32,7 +32,6 @@ pub struct OrderedId(pub u32);
#[derive(Default)]
pub(crate) struct DictionaryBuilder {
dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
}
impl DictionaryBuilder {
@@ -44,8 +43,6 @@ impl DictionaryBuilder {
}
let new_id = UnorderedId(self.dict.len() as u32);
self.dict.insert(term.to_vec(), new_id);
self.memory_consumption += term.len();
self.memory_consumption += 40; // Term Metadata + HashMap overhead
new_id
}
@@ -66,10 +63,6 @@ impl DictionaryBuilder {
sstable_builder.finish()?;
Ok(TermIdMapping { unordered_to_ord })
}
pub(crate) fn mem_usage(&self) -> usize {
self.memory_consumption
}
}
#[cfg(test)]

View File

@@ -1,14 +1,14 @@
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use std::{fmt, io};
use common::file_slice::FileSlice;
use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use common::{DateTime, HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, NumericalType};
use crate::{Cardinality, NumericalType};
#[derive(Clone)]
pub enum DynamicColumn {
@@ -22,54 +22,19 @@ pub enum DynamicColumn {
Str(StrColumn),
}
impl fmt::Debug for DynamicColumn {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "[{} {} |", self.get_cardinality(), self.column_type())?;
match self {
DynamicColumn::Bool(col) => write!(f, " {col:?}")?,
DynamicColumn::I64(col) => write!(f, " {col:?}")?,
DynamicColumn::U64(col) => write!(f, " {col:?}")?,
DynamicColumn::F64(col) => write!(f, "{col:?}")?,
DynamicColumn::IpAddr(col) => write!(f, "{col:?}")?,
DynamicColumn::DateTime(col) => write!(f, "{col:?}")?,
DynamicColumn::Bytes(col) => write!(f, "{col:?}")?,
DynamicColumn::Str(col) => write!(f, "{col:?}")?,
}
write!(f, "]")
}
}
impl DynamicColumn {
pub fn column_index(&self) -> &ColumnIndex {
match self {
DynamicColumn::Bool(c) => &c.index,
DynamicColumn::I64(c) => &c.index,
DynamicColumn::U64(c) => &c.index,
DynamicColumn::F64(c) => &c.index,
DynamicColumn::IpAddr(c) => &c.index,
DynamicColumn::DateTime(c) => &c.index,
DynamicColumn::Bytes(c) => &c.ords().index,
DynamicColumn::Str(c) => &c.ords().index,
}
}
pub fn get_cardinality(&self) -> Cardinality {
self.column_index().get_cardinality()
}
pub fn num_values(&self) -> u32 {
match self {
DynamicColumn::Bool(c) => c.values.num_vals(),
DynamicColumn::I64(c) => c.values.num_vals(),
DynamicColumn::U64(c) => c.values.num_vals(),
DynamicColumn::F64(c) => c.values.num_vals(),
DynamicColumn::IpAddr(c) => c.values.num_vals(),
DynamicColumn::DateTime(c) => c.values.num_vals(),
DynamicColumn::Bytes(c) => c.ords().values.num_vals(),
DynamicColumn::Str(c) => c.ords().values.num_vals(),
DynamicColumn::Bool(c) => c.get_cardinality(),
DynamicColumn::I64(c) => c.get_cardinality(),
DynamicColumn::U64(c) => c.get_cardinality(),
DynamicColumn::F64(c) => c.get_cardinality(),
DynamicColumn::IpAddr(c) => c.get_cardinality(),
DynamicColumn::DateTime(c) => c.get_cardinality(),
DynamicColumn::Bytes(c) => c.ords().get_cardinality(),
DynamicColumn::Str(c) => c.ords().get_cardinality(),
}
}
pub fn column_type(&self) -> ColumnType {
match self {
DynamicColumn::Bool(_) => ColumnType::Bool,
@@ -108,11 +73,11 @@ impl DynamicColumn {
fn coerce_to_f64(self) -> Option<DynamicColumn> {
match self {
DynamicColumn::I64(column) => Some(DynamicColumn::F64(Column {
index: column.index,
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapI64ToF64)),
})),
DynamicColumn::U64(column) => Some(DynamicColumn::F64(Column {
index: column.index,
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapU64ToF64)),
})),
DynamicColumn::F64(_) => Some(self),
@@ -126,7 +91,7 @@ impl DynamicColumn {
return None;
}
Some(DynamicColumn::I64(Column {
index: column.index,
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapU64ToI64)),
}))
}
@@ -141,7 +106,7 @@ impl DynamicColumn {
return None;
}
Some(DynamicColumn::U64(Column {
index: column.index,
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapI64ToU64)),
}))
}
@@ -228,7 +193,7 @@ static_dynamic_conversions!(StrColumn, Str);
static_dynamic_conversions!(BytesColumn, Bytes);
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct DynamicColumnHandle {
pub(crate) file_slice: FileSlice,
pub(crate) column_type: ColumnType,
@@ -241,13 +206,14 @@ impl DynamicColumnHandle {
self.open_internal(column_bytes)
}
#[doc(hidden)]
pub fn file_slice(&self) -> &FileSlice {
&self.file_slice
// TODO rename load_async
pub async fn open_async(&self) -> io::Result<DynamicColumn> {
let column_bytes: OwnedBytes = self.file_slice.read_bytes_async().await?;
self.open_internal(column_bytes)
}
/// Returns the `u64` fast field reader reader associated with `fields` of types
/// Str, u64, i64, f64, bool, or datetime.
/// Str, u64, i64, f64, or datetime.
///
/// If not, the fastfield reader will returns the u64-value associated with the original
/// FastValue.
@@ -258,12 +224,9 @@ impl DynamicColumnHandle {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column))
}
ColumnType::Bool => Ok(None),
ColumnType::IpAddr => Ok(None),
ColumnType::Bool
| ColumnType::I64
| ColumnType::U64
| ColumnType::F64
| ColumnType::DateTime => {
ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
Ok(Some(column))
}
@@ -286,8 +249,8 @@ impl DynamicColumnHandle {
Ok(dynamic_column)
}
pub fn num_bytes(&self) -> ByteCount {
self.file_slice.len().into()
pub fn num_bytes(&self) -> usize {
self.file_slice.len()
}
pub fn column_type(&self) -> ColumnType {

View File

@@ -7,12 +7,10 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::fmt::Display;
use std::io;
mod block_accessor;
mod column;
pub mod column_index;
mod column_index;
pub mod column_values;
mod columnar;
mod dictionary;
@@ -21,12 +19,9 @@ mod iterable;
pub(crate) mod utils;
mod value;
pub use block_accessor::ColumnBlockAccessor;
pub use column::{BytesColumn, Column, StrColumn};
pub use column_index::ColumnIndex;
pub use column_values::{
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
pub use column_values::{ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
@@ -39,7 +34,7 @@ pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32;
pub type DocId = u32;
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy)]
pub struct RowAddr {
pub segment_ord: u32,
pub row_id: RowId,
@@ -76,17 +71,6 @@ pub enum Cardinality {
Multivalued = 2,
}
impl Display for Cardinality {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let short_str = match self {
Cardinality::Full => "full",
Cardinality::Optional => "opt",
Cardinality::Multivalued => "mult",
};
write!(f, "{short_str}")
}
}
impl Cardinality {
pub fn is_optional(&self) -> bool {
matches!(self, Cardinality::Optional)
@@ -97,6 +81,7 @@ impl Cardinality {
pub(crate) fn to_code(self) -> u8 {
self as u8
}
pub(crate) fn try_from_code(code: u8) -> Result<Cardinality, InvalidData> {
match code {
0 => Ok(Cardinality::Full),

View File

@@ -1,19 +1,10 @@
use std::collections::HashMap;
use std::fmt::Debug;
use std::net::Ipv6Addr;
use common::DateTime;
use proptest::prelude::*;
use proptest::sample::subsequence;
use crate::column_values::MonotonicallyMappableToU128;
use crate::columnar::{ColumnType, ColumnTypeCategory};
use crate::columnar::ColumnType;
use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
use crate::value::{Coerce, NumericalValue};
use crate::{
BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowAddr, RowId,
ShuffleMergeOrder, StackMergeOrder,
};
use crate::value::NumericalValue;
use crate::{Cardinality, ColumnarReader, ColumnarWriter};
#[test]
fn test_dataframe_writer_str() {
@@ -26,7 +17,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 99);
assert_eq!(cols[0].num_bytes(), 158);
}
#[test]
@@ -40,7 +31,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 99);
assert_eq!(cols[0].num_bytes(), 158);
}
#[test]
@@ -57,9 +48,7 @@ fn test_dataframe_writer_bool() {
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -81,9 +70,7 @@ fn test_dataframe_writer_u64_multivalued() {
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
};
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
@@ -105,9 +92,7 @@ fn test_dataframe_writer_ip_addr() {
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
@@ -140,10 +125,8 @@ fn test_dataframe_writer_numerical() {
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else {
panic!();
};
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
let DynamicColumn::I64(column_i64) = column else { panic!(); };
assert_eq!(column_i64.idx.get_cardinality(), Cardinality::Optional);
assert_eq!(column_i64.first(0), None);
assert_eq!(column_i64.first(1), Some(12i64));
assert_eq!(column_i64.first(2), Some(13i64));
@@ -153,46 +136,6 @@ fn test_dataframe_writer_numerical() {
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
}
#[test]
fn test_dataframe_sort_by_full() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 2, false);
assert_eq!(data, vec![0, 1]);
}
#[test]
fn test_dataframe_sort_by_opt() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 5, false);
// 0, 2, 4 is 0.0
assert_eq!(data, vec![0, 2, 4, 3, 1]);
let data = dataframe_writer.sort_order("value", 5, true);
assert_eq!(
data,
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
);
}
#[test]
fn test_dataframe_sort_by_multi() {
let mut dataframe_writer = ColumnarWriter::default();
// valid for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
// those are ignored for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
// valid for sort
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
// ignored, would change sort order
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
let data = dataframe_writer.sort_order("value", 4, false);
assert_eq!(data, vec![0, 2, 1, 3]);
}
#[test]
fn test_dictionary_encoded_str() {
let mut buffer = Vec::new();
@@ -206,9 +149,7 @@ fn test_dictionary_encoded_str() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
@@ -240,9 +181,7 @@ fn test_dictionary_encoded_bytes() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
@@ -271,675 +210,3 @@ fn test_dictionary_encoded_bytes() {
.unwrap();
assert_eq!(term_buffer, b"b");
}
fn num_strategy() -> impl Strategy<Value = NumericalValue> {
prop_oneof![
3 => Just(NumericalValue::U64(0u64)),
3 => Just(NumericalValue::U64(u64::MAX)),
3 => Just(NumericalValue::I64(0i64)),
3 => Just(NumericalValue::I64(i64::MIN)),
3 => Just(NumericalValue::I64(i64::MAX)),
3 => Just(NumericalValue::F64(1.2f64)),
1 => any::<f64>().prop_map(NumericalValue::from),
1 => any::<u64>().prop_map(NumericalValue::from),
1 => any::<i64>().prop_map(NumericalValue::from),
]
}
#[derive(Debug, Clone, Copy)]
enum ColumnValue {
Str(&'static str),
Bytes(&'static [u8]),
Numerical(NumericalValue),
IpAddr(Ipv6Addr),
Bool(bool),
DateTime(DateTime),
}
impl<T: Into<NumericalValue>> From<T> for ColumnValue {
fn from(val: T) -> ColumnValue {
ColumnValue::Numerical(val.into())
}
}
impl ColumnValue {
pub(crate) fn column_type_category(&self) -> ColumnTypeCategory {
match self {
ColumnValue::Str(_) => ColumnTypeCategory::Str,
ColumnValue::Bytes(_) => ColumnTypeCategory::Bytes,
ColumnValue::Numerical(_) => ColumnTypeCategory::Numerical,
ColumnValue::IpAddr(_) => ColumnTypeCategory::IpAddr,
ColumnValue::Bool(_) => ColumnTypeCategory::Bool,
ColumnValue::DateTime(_) => ColumnTypeCategory::DateTime,
}
}
}
fn column_name_strategy() -> impl Strategy<Value = &'static str> {
prop_oneof![Just("c1"), Just("c2")]
}
fn string_strategy() -> impl Strategy<Value = &'static str> {
prop_oneof![Just("a"), Just("b")]
}
fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
prop_oneof![Just(&[0u8][..]), Just(&[1u8][..])]
}
// A random column value
fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
prop_oneof![
10 => string_strategy().prop_map(ColumnValue::Str),
1 => bytes_strategy().prop_map(ColumnValue::Bytes),
40 => num_strategy().prop_map(ColumnValue::Numerical),
1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new(
127,
0,
0,
0,
0,
0,
0,
ip_addr_byte
))),
1 => any::<bool>().prop_map(ColumnValue::Bool),
1 => (0_679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
]
}
// A document contains up to 4 values.
fn doc_strategy() -> impl Strategy<Value = Vec<(&'static str, ColumnValue)>> {
proptest::collection::vec((column_name_strategy(), column_value_strategy()), 0..=4)
}
fn num_docs_strategy() -> impl Strategy<Value = usize> {
prop_oneof!(
// We focus heavily on the 0..2 case as we assume it is sufficient to cover all edge cases.
0usize..=3usize,
// We leave 50% of the effort exploring more defensively.
3usize..=12usize
)
}
// A columnar contains up to 2 docs.
fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, ColumnValue)>>> {
num_docs_strategy()
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
}
fn columnar_docs_and_mapping_strategy(
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
columnar_docs_strategy().prop_flat_map(|docs| {
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
})
}
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
Just((0u32..n as RowId).collect()).prop_shuffle()
}
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
let vals: Vec<usize> = (0..n).collect();
subsequence(vals, 0..=n).prop_shuffle()
}
fn build_columnar_with_mapping(
docs: &[Vec<(&'static str, ColumnValue)>],
old_to_new_row_ids_opt: Option<&[RowId]>,
) -> ColumnarReader {
let num_docs = docs.len() as u32;
let mut buffer = Vec::new();
let mut columnar_writer = ColumnarWriter::default();
for (doc_id, vals) in docs.iter().enumerate() {
for (column_name, col_val) in vals {
match *col_val {
ColumnValue::Str(str_val) => {
columnar_writer.record_str(doc_id as u32, column_name, str_val);
}
ColumnValue::Bytes(bytes) => {
columnar_writer.record_bytes(doc_id as u32, column_name, bytes)
}
ColumnValue::Numerical(num) => {
columnar_writer.record_numerical(doc_id as u32, column_name, num);
}
ColumnValue::IpAddr(ip_addr) => {
columnar_writer.record_ip_addr(doc_id as u32, column_name, ip_addr);
}
ColumnValue::Bool(bool_val) => {
columnar_writer.record_bool(doc_id as u32, column_name, bool_val);
}
ColumnValue::DateTime(date_time) => {
columnar_writer.record_datetime(doc_id as u32, column_name, date_time);
}
}
}
}
columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
}
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
build_columnar_with_mapping(docs, None)
}
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
assert_columnar_eq(left, right, false);
}
fn assert_columnar_eq(
left: &ColumnarReader,
right: &ColumnarReader,
lenient_on_numerical_value: bool,
) {
assert_eq!(left.num_rows(), right.num_rows());
let left_columns = left.list_columns().unwrap();
let right_columns = right.list_columns().unwrap();
assert_eq!(left_columns.len(), right_columns.len());
for i in 0..left_columns.len() {
assert_eq!(left_columns[i].0, right_columns[i].0);
let left_column = left_columns[i].1.open().unwrap();
let right_column = right_columns[i].1.open().unwrap();
assert_dyn_column_eq(&left_column, &right_column, lenient_on_numerical_value);
}
}
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
left: &Column<T>,
right: &Column<T>,
) {
assert_eq!(left.get_cardinality(), right.get_cardinality());
assert_eq!(left.num_docs(), right.num_docs());
let num_docs = left.num_docs();
for doc in 0..num_docs {
assert_eq!(
left.index.value_row_ids(doc),
right.index.value_row_ids(doc)
);
}
assert_eq!(left.values.num_vals(), right.values.num_vals());
let num_vals = left.values.num_vals();
for i in 0..num_vals {
assert_eq!(left.values.get_val(i), right.values.get_val(i));
}
}
fn assert_bytes_column_eq(left: &BytesColumn, right: &BytesColumn) {
assert_eq!(
left.term_ord_column.get_cardinality(),
right.term_ord_column.get_cardinality()
);
assert_eq!(left.num_rows(), right.num_rows());
assert_column_eq(&left.term_ord_column, &right.term_ord_column);
assert_eq!(left.dictionary.num_terms(), right.dictionary.num_terms());
let num_terms = left.dictionary.num_terms();
let mut left_terms = left.dictionary.stream().unwrap();
let mut right_terms = right.dictionary.stream().unwrap();
for _ in 0..num_terms {
assert!(left_terms.advance());
assert!(right_terms.advance());
assert_eq!(left_terms.key(), right_terms.key());
}
assert!(!left_terms.advance());
assert!(!right_terms.advance());
}
fn assert_dyn_column_eq(
left_dyn_column: &DynamicColumn,
right_dyn_column: &DynamicColumn,
lenient_on_numerical_value: bool,
) {
assert_eq!(
&left_dyn_column.get_cardinality(),
&right_dyn_column.get_cardinality()
);
match &(left_dyn_column, right_dyn_column) {
(DynamicColumn::Bool(left_col), DynamicColumn::Bool(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::I64(left_col), DynamicColumn::I64(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::U64(left_col), DynamicColumn::U64(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::F64(left_col), DynamicColumn::F64(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::DateTime(left_col), DynamicColumn::DateTime(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::IpAddr(left_col), DynamicColumn::IpAddr(right_col)) => {
assert_column_eq(left_col, right_col);
}
(DynamicColumn::Bytes(left_col), DynamicColumn::Bytes(right_col)) => {
assert_bytes_column_eq(left_col, right_col);
}
(DynamicColumn::Str(left_col), DynamicColumn::Str(right_col)) => {
assert_bytes_column_eq(left_col, right_col);
}
(left, right) => {
if lenient_on_numerical_value {
assert_eq!(
ColumnTypeCategory::from(left.column_type()),
ColumnTypeCategory::from(right.column_type())
);
} else {
panic!(
"Column type are not the same: {:?} vs {:?}",
left.column_type(),
right.column_type()
);
}
}
}
}
trait AssertEqualToColumnValue {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue);
}
impl AssertEqualToColumnValue for bool {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Bool(val) = column_value else {
panic!()
};
assert_eq!(self, val);
}
}
impl AssertEqualToColumnValue for Ipv6Addr {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::IpAddr(val) = column_value else {
panic!()
};
assert_eq!(self, val);
}
}
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Numerical(num) = column_value else {
panic!()
};
assert_eq!(self, &T::coerce(*num));
}
}
impl AssertEqualToColumnValue for DateTime {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::DateTime(dt) = column_value else {
panic!()
};
assert_eq!(self, dt);
}
}
fn assert_column_values<
T: AssertEqualToColumnValue + PartialEq + Copy + PartialOrd + Debug + Send + Sync + 'static,
>(
col: &Column<T>,
expected: &HashMap<u32, Vec<&ColumnValue>>,
) {
let mut num_non_empty_rows = 0;
for doc in 0..col.num_docs() {
let doc_vals: Vec<T> = col.values_for_doc(doc).collect();
if doc_vals.is_empty() {
continue;
}
num_non_empty_rows += 1;
let expected_vals = expected.get(&doc).unwrap();
assert_eq!(doc_vals.len(), expected_vals.len());
for (val, &expected) in doc_vals.iter().zip(expected_vals.iter()) {
val.assert_equal_to_column_value(expected)
}
}
assert_eq!(num_non_empty_rows, expected.len());
}
fn assert_bytes_column_values(
col: &BytesColumn,
expected: &HashMap<u32, Vec<&ColumnValue>>,
is_str: bool,
) {
let mut num_non_empty_rows = 0;
let mut buffer = Vec::new();
for doc in 0..col.term_ord_column.num_docs() {
let doc_vals: Vec<u64> = col.term_ords(doc).collect();
if doc_vals.is_empty() {
continue;
}
let expected_vals = expected.get(&doc).unwrap();
assert_eq!(doc_vals.len(), expected_vals.len());
for (&expected_col_val, &ord) in expected_vals.iter().zip(&doc_vals) {
col.ord_to_bytes(ord, &mut buffer).unwrap();
match expected_col_val {
ColumnValue::Str(str_val) => {
assert!(is_str);
assert_eq!(str_val.as_bytes(), &buffer);
}
ColumnValue::Bytes(bytes_val) => {
assert!(!is_str);
assert_eq!(bytes_val, &buffer);
}
_ => {
panic!();
}
}
}
num_non_empty_rows += 1;
}
assert_eq!(num_non_empty_rows, expected.len());
}
// This proptest attempts to create a tiny columnar based of up to 3 rows, and checks that the
// resulting columnar matches the row data.
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_single_columnar_builder_proptest(docs in columnar_docs_strategy()) {
let columnar = build_columnar(&docs[..]);
assert_eq!(columnar.num_rows() as usize, docs.len());
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
for (doc_id, doc_vals) in docs.iter().enumerate() {
for (col_name, col_val) in doc_vals {
expected_columns
.entry((col_name, col_val.column_type_category()))
.or_default()
.entry(doc_id as u32)
.or_default()
.push(col_val);
}
}
let column_list = columnar.list_columns().unwrap();
assert_eq!(expected_columns.len(), column_list.len());
for (column_name, column) in column_list {
let dynamic_column = column.open().unwrap();
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
match &dynamic_column {
DynamicColumn::Bool(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::I64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::U64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::F64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::IpAddr(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::DateTime(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::Bytes(col) =>
assert_bytes_column_values(col, expected_col_values, false),
DynamicColumn::Str(col) =>
assert_bytes_column_values(col, expected_col_values, true),
}
}
}
}
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
assert_eq!(columnar.num_rows() as usize, docs.len());
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
for (doc_id, doc_vals) in docs.iter().enumerate() {
for (col_name, col_val) in doc_vals {
expected_columns
.entry((col_name, col_val.column_type_category()))
.or_default()
.entry(mapping[doc_id])
.or_default()
.push(col_val);
}
}
let column_list = columnar.list_columns().unwrap();
assert_eq!(expected_columns.len(), column_list.len());
for (column_name, column) in column_list {
let dynamic_column = column.open().unwrap();
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
for _doc_id in 0..columnar.num_rows() {
match &dynamic_column {
DynamicColumn::Bool(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::I64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::U64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::F64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::IpAddr(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::DateTime(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::Bytes(col) =>
assert_bytes_column_values(col, expected_col_values, false),
DynamicColumn::Str(col) =>
assert_bytes_column_values(col, expected_col_values, true),
}
}
}
}
}
// This tests create 2 or 3 random small columnar and attempts to merge them.
// It compares the resulting merged dataframe with what would have been obtained by building the
// dataframe from the concatenated rows to begin with.
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
#[test]
fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
// columnar 1
vec![
// doc 1.1
vec![("c2", ColumnValue::Numerical(0i64.into()))],
],
// columnar2
vec![
// doc 2.1
vec![("c2", ColumnValue::Numerical(0u64.into()))],
// doc 2.2
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
],
];
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
// TODO add non trivial remap and merge
// TODO test required_columns
// TODO document edge case: required_columns incompatible with values.
fn columnar_docs_and_remap(
) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
let row_addrs: Vec<RowAddr> = columnars_docs
.iter()
.enumerate()
.flat_map(|(segment_ord, columnar_docs)| {
(0u32..columnar_docs.len() as u32).map(move |row_id| RowAddr {
segment_ord: segment_ord as u32,
row_id,
})
})
.collect();
permutation_and_subset_strategy(row_addrs.len()).prop_map(move |shuffled_subset| {
let shuffled_row_addr_subset: Vec<RowAddr> =
shuffled_subset.iter().map(|ord| row_addrs[*ord]).collect();
(columnars_docs.clone(), shuffled_row_addr_subset)
})
},
)
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
}
}
#[test]
fn test_columnar_merge_empty() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 0];
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, vec![]);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 0);
assert_eq!(merged_columnar.num_columns(), 0);
}
#[test]
fn test_columnar_merge_single_str_column() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 1];
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&segment_num_rows,
vec![RowAddr {
segment_ord: 1u32,
row_id: 0u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
}
#[test]
fn test_delete_decrease_cardinality() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[
vec![
("c", ColumnValue::from(0i64)),
("c", ColumnValue::from(0i64)),
],
vec![("c", ColumnValue::from(0i64))],
][..];
// c is multivalued here
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&[0, 2],
vec![RowAddr {
segment_ord: 1u32,
row_id: 1u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
let cols = merged_columnar.read_columns("c").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].column_type(), ColumnType::I64);
assert_eq!(cols[0].open().unwrap().get_cardinality(), Cardinality::Full);
}

View File

@@ -109,7 +109,7 @@ impl Coerce for f64 {
impl Coerce for DateTime {
fn coerce(value: NumericalValue) -> Self {
let timestamp_micros = i64::coerce(value);
DateTime::from_timestamp_nanos(timestamp_micros)
DateTime::from_timestamp_micros(timestamp_micros)
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.6.0"
version = "0.5.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
@@ -14,7 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.6", path="../ownedbytes" }
ownedbytes = { version= "0.5", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

View File

@@ -1,39 +0,0 @@
#![feature(test)]
extern crate test;
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
}

View File

@@ -4,8 +4,6 @@ use std::{fmt, io, u64};
use ownedbytes::OwnedBytes;
use crate::ByteCount;
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64);
@@ -388,8 +386,8 @@ impl ReadOnlyBitSet {
}
/// Number of bytes used in the bitset representation.
pub fn num_bytes(&self) -> ByteCount {
self.data.len().into()
pub fn num_bytes(&self) -> usize {
self.data.len()
}
}

View File

@@ -1,114 +0,0 @@
use std::iter::Sum;
use std::ops::{Add, AddAssign};
use serde::{Deserialize, Serialize};
/// Indicates space usage in bytes
#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct ByteCount(u64);
impl std::fmt::Debug for ByteCount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.human_readable())
}
}
impl std::fmt::Display for ByteCount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.human_readable())
}
}
const SUFFIX_AND_THRESHOLD: [(&str, u64); 5] = [
("KB", 1_000),
("MB", 1_000_000),
("GB", 1_000_000_000),
("TB", 1_000_000_000_000),
("PB", 1_000_000_000_000_000),
];
impl ByteCount {
#[inline]
pub fn get_bytes(&self) -> u64 {
self.0
}
pub fn human_readable(&self) -> String {
for (suffix, threshold) in SUFFIX_AND_THRESHOLD.iter().rev() {
if self.get_bytes() >= *threshold {
let unit_num = self.get_bytes() as f64 / *threshold as f64;
return format!("{unit_num:.2} {suffix}");
}
}
format!("{:.2} B", self.get_bytes())
}
}
impl From<u64> for ByteCount {
fn from(value: u64) -> Self {
ByteCount(value)
}
}
impl From<usize> for ByteCount {
fn from(value: usize) -> Self {
ByteCount(value as u64)
}
}
impl Sum for ByteCount {
#[inline]
fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
iter.fold(ByteCount::default(), |acc, x| acc + x)
}
}
impl PartialEq<u64> for ByteCount {
#[inline]
fn eq(&self, other: &u64) -> bool {
self.get_bytes() == *other
}
}
impl PartialOrd<u64> for ByteCount {
#[inline]
fn partial_cmp(&self, other: &u64) -> Option<std::cmp::Ordering> {
self.get_bytes().partial_cmp(other)
}
}
impl Add for ByteCount {
type Output = Self;
#[inline]
fn add(self, other: Self) -> Self {
Self(self.get_bytes() + other.get_bytes())
}
}
impl AddAssign for ByteCount {
#[inline]
fn add_assign(&mut self, other: Self) {
*self = Self(self.get_bytes() + other.get_bytes());
}
}
#[cfg(test)]
mod test {
use crate::ByteCount;
#[test]
fn test_bytes() {
assert_eq!(ByteCount::from(0u64).human_readable(), "0 B");
assert_eq!(ByteCount::from(300u64).human_readable(), "300 B");
assert_eq!(ByteCount::from(1_000_000u64).human_readable(), "1.00 MB");
assert_eq!(ByteCount::from(1_500_000u64).human_readable(), "1.50 MB");
assert_eq!(
ByteCount::from(1_500_000_000u64).human_readable(),
"1.50 GB"
);
assert_eq!(
ByteCount::from(3_213_000_000_000u64).human_readable(),
"3.21 TB"
);
}
}

View File

@@ -1,36 +1,25 @@
#![allow(deprecated)]
use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
/// DateTime Precision
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DateTimePrecision {
/// Second precision.
pub enum DatePrecision {
/// Seconds precision
#[default]
Seconds,
/// Millisecond precision.
/// Milli-seconds precision.
Milliseconds,
/// Microsecond precision.
/// Micro-seconds precision.
Microseconds,
/// Nanosecond precision.
Nanoseconds,
}
#[deprecated(since = "0.20.0", note = "Use `DateTimePrecision` instead")]
pub type DatePrecision = DateTimePrecision;
/// A date/time value with nanoseconds precision.
/// A date/time value with microsecond precision.
///
/// This timestamp does not carry any explicit time zone information.
/// Users are responsible for applying the provided conversion
@@ -40,48 +29,31 @@ pub type DatePrecision = DateTimePrecision;
/// All constructors and conversions are provided as explicit
/// functions and not by implementing any `From`/`Into` traits
/// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DateTime {
// Timestamp in nanoseconds.
pub(crate) timestamp_nanos: i64,
// Timestamp in microseconds.
pub(crate) timestamp_micros: i64,
}
impl DateTime {
/// Minimum possible `DateTime` value.
pub const MIN: DateTime = DateTime {
timestamp_nanos: i64::MIN,
};
/// Maximum possible `DateTime` value.
pub const MAX: DateTime = DateTime {
timestamp_nanos: i64::MAX,
};
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
Self {
timestamp_nanos: seconds * 1_000_000_000,
timestamp_micros: seconds * 1_000_000,
}
}
/// Create new from UNIX timestamp in milliseconds
pub const fn from_timestamp_millis(milliseconds: i64) -> Self {
Self {
timestamp_nanos: milliseconds * 1_000_000,
timestamp_micros: milliseconds * 1_000,
}
}
/// Create new from UNIX timestamp in microseconds.
pub const fn from_timestamp_micros(microseconds: i64) -> Self {
Self {
timestamp_nanos: microseconds * 1_000,
}
}
/// Create new from UNIX timestamp in nanoseconds.
pub const fn from_timestamp_nanos(nanoseconds: i64) -> Self {
Self {
timestamp_nanos: nanoseconds,
timestamp_micros: microseconds,
}
}
@@ -89,9 +61,9 @@ impl DateTime {
///
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_nanos = dt.unix_timestamp_nanos() as i64;
Self { timestamp_nanos }
pub const fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_micros = dt.unix_timestamp() * 1_000_000 + dt.microsecond() as i64;
Self { timestamp_micros }
}
/// Create new from `PrimitiveDateTime`
@@ -105,27 +77,23 @@ impl DateTime {
/// Convert to UNIX timestamp in seconds.
pub const fn into_timestamp_secs(self) -> i64 {
self.timestamp_nanos / 1_000_000_000
self.timestamp_micros / 1_000_000
}
/// Convert to UNIX timestamp in milliseconds.
pub const fn into_timestamp_millis(self) -> i64 {
self.timestamp_nanos / 1_000_000
self.timestamp_micros / 1_000
}
/// Convert to UNIX timestamp in microseconds.
pub const fn into_timestamp_micros(self) -> i64 {
self.timestamp_nanos / 1_000
}
/// Convert to UNIX timestamp in nanoseconds.
pub const fn into_timestamp_nanos(self) -> i64 {
self.timestamp_nanos
self.timestamp_micros
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(self.timestamp_nanos as i128)
let timestamp_nanos = self.timestamp_micros as i128 * 1000;
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(timestamp_nanos)
.expect("valid UNIX timestamp");
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
utc_datetime
@@ -148,34 +116,21 @@ impl DateTime {
}
/// Truncates the microseconds value to the corresponding precision.
pub fn truncate(self, precision: DateTimePrecision) -> Self {
pub fn truncate(self, precision: DatePrecision) -> Self {
let truncated_timestamp_micros = match precision {
DateTimePrecision::Seconds => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
DateTimePrecision::Milliseconds => (self.timestamp_nanos / 1_000_000) * 1_000_000,
DateTimePrecision::Microseconds => (self.timestamp_nanos / 1_000) * 1_000,
DateTimePrecision::Nanoseconds => self.timestamp_nanos,
DatePrecision::Seconds => (self.timestamp_micros / 1_000_000) * 1_000_000,
DatePrecision::Milliseconds => (self.timestamp_micros / 1_000) * 1_000,
DatePrecision::Microseconds => self.timestamp_micros,
};
Self {
timestamp_nanos: truncated_timestamp_micros,
timestamp_micros: truncated_timestamp_micros,
}
}
}
impl fmt::Debug for DateTime {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
f.write_str(&utc_rfc3339)
}
}
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}

View File

@@ -1,4 +1,3 @@
use std::fs::File;
use std::ops::{Deref, Range, RangeBounds};
use std::sync::Arc;
use std::{fmt, io};
@@ -6,7 +5,7 @@ use std::{fmt, io};
use async_trait::async_trait;
use ownedbytes::{OwnedBytes, StableDeref};
use crate::{ByteCount, HasLen};
use crate::HasLen;
/// Objects that represents files sections in tantivy.
///
@@ -33,62 +32,6 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + fmt::Debug {
}
}
#[derive(Debug)]
/// A File with it's length included.
pub struct WrapFile {
file: File,
len: usize,
}
impl WrapFile {
/// Creates a new WrapFile and stores its length.
pub fn new(file: File) -> io::Result<Self> {
let len = file.metadata()?.len() as usize;
Ok(WrapFile { file, len })
}
}
#[async_trait]
impl FileHandle for WrapFile {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
let file_len = self.len();
// Calculate the actual range to read, ensuring it stays within file boundaries
let start = range.start;
let end = range.end.min(file_len);
// Ensure the start is before the end of the range
if start >= end {
return Err(io::Error::new(io::ErrorKind::InvalidInput, "Invalid range"));
}
let mut buffer = vec![0; end - start];
#[cfg(unix)]
{
use std::os::unix::prelude::FileExt;
self.file.read_exact_at(&mut buffer, start as u64)?;
}
#[cfg(not(unix))]
{
use std::io::{Read, Seek};
let mut file = self.file.try_clone()?; // Clone the file to read from it separately
// Seek to the start position in the file
file.seek(io::SeekFrom::Start(start as u64))?;
// Read the data into the buffer
file.read_exact(&mut buffer)?;
}
Ok(OwnedBytes::new(buffer))
}
// todo implement async
}
impl HasLen for WrapFile {
fn len(&self) -> usize {
self.len
}
}
#[async_trait]
impl FileHandle for &'static [u8] {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
@@ -124,30 +67,6 @@ impl fmt::Debug for FileSlice {
}
}
impl FileSlice {
pub fn stream_file_chunks(&self) -> impl Iterator<Item = io::Result<OwnedBytes>> + '_ {
let len = self.range.end;
let mut start = self.range.start;
std::iter::from_fn(move || {
/// Returns chunks of 1MB of data from the FileHandle.
const CHUNK_SIZE: usize = 1024 * 1024; // 1MB
if start < len {
let end = (start + CHUNK_SIZE).min(len);
let range = start..end;
let chunk = self.data.read_bytes(range);
start += CHUNK_SIZE;
match chunk {
Ok(chunk) => Some(Ok(chunk)),
Err(e) => Some(Err(e)),
}
} else {
None
}
})
}
}
/// Takes a range, a `RangeBounds` object, and returns
/// a `Range` that corresponds to the relative application of the
/// `RangeBounds` object to the original `Range`.
@@ -297,11 +216,6 @@ impl FileSlice {
pub fn slice_to(&self, to_offset: usize) -> FileSlice {
self.slice(0..to_offset)
}
/// Returns the byte count of the FileSlice.
pub fn num_bytes(&self) -> ByteCount {
self.range.len().into()
}
}
#[async_trait]

View File

@@ -27,15 +27,15 @@ pub trait GroupByIteratorExtended: Iterator {
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: PartialEq + Clone,
Self::Item: Clone,
K: PartialEq + Copy,
Self::Item: Copy,
{
GroupByIterator::new(self, key)
}
}
impl<I: Iterator> GroupByIteratorExtended for I {}
pub struct GroupByIterator<I, F, K: Clone>
pub struct GroupByIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -50,7 +50,7 @@ where
inner: Rc<RefCell<GroupByShared<I, F, K>>>,
}
struct GroupByShared<I, F, K: Clone>
struct GroupByShared<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -63,7 +63,7 @@ impl<I, F, K> GroupByIterator<I, F, K>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
fn new(inner: I, group_by_fn: F) -> Self {
let inner = GroupByShared {
@@ -80,28 +80,28 @@ where
impl<I, F, K> Iterator for GroupByIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
type Item = (K, GroupIterator<I, F, K>);
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
let value = inner.iter.peek()?.clone();
let value = *inner.iter.peek()?;
let key = (inner.group_by_fn)(&value);
let inner = self.inner.clone();
let group_iter = GroupIterator {
inner,
group_key: key.clone(),
group_key: key,
};
Some((key, group_iter))
}
}
pub struct GroupIterator<I, F, K: Clone>
pub struct GroupIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -110,10 +110,10 @@ where
group_key: K,
}
impl<I, F, K: PartialEq + Clone> Iterator for GroupIterator<I, F, K>
impl<I, F, K: PartialEq + Copy> Iterator for GroupIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
{
type Item = I::Item;
@@ -121,7 +121,7 @@ where
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
// peek if next value is in group
let peek_val = inner.iter.peek()?.clone();
let peek_val = *inner.iter.peek()?;
if (inner.group_by_fn)(&peek_val) == self.group_key {
inner.iter.next()
} else {

View File

@@ -1,112 +0,0 @@
use crate::replace_in_place;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
/// Create a new JsonPathWriter, that creates flattened json paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}
impl JsonPathWriter {
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}
/// When expand_dots is enabled, json object like
/// `{"k8s.node.id": 5}` is processed as if it was
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other, enabling that feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}
/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if !self.path.is_empty() {
self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single byte ut8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}
/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}
/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}
/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}
impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();
writer.push("root");
assert_eq!(writer.as_str(), "root");
writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");
writer.pop();
assert_eq!(writer.as_str(), "root");
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");
writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
}

View File

@@ -5,25 +5,20 @@ use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod byte_count;
mod datetime;
pub mod file_slice;
mod group_by;
mod json_path_writer;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use byte_count::ByteCount;
#[allow(deprecated)]
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision};
pub use datetime::{DatePrecision, DateTime};
pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
serialize_vint_u32, write_u32_vint, VInt, VIntU128,
};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
@@ -114,22 +109,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
})
}
/// Replaces a given byte in the `bytes` slice of bytes.
///
/// This function assumes that the needle is rarely contained in the bytes string
/// and offers a fast path if the needle is not present.
#[inline]
pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
if !bytes.contains(&needle) {
return;
}
for b in bytes {
if *b == needle {
*b = replacement;
}
}
}
#[cfg(test)]
pub mod test {
@@ -194,20 +173,4 @@ pub mod test {
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
#[test]
fn test_replace_in_place() {
let test_aux = |before_replacement: &[u8], expected: &[u8]| {
let mut bytes: Vec<u8> = before_replacement.to_vec();
super::replace_in_place(b'b', b'c', &mut bytes);
assert_eq!(&bytes[..], expected);
};
test_aux(b"", b"");
test_aux(b"b", b"c");
test_aux(b"baaa", b"caaa");
test_aux(b"aaab", b"aaac");
test_aux(b"aaabaa", b"aaacaa");
test_aux(b"aaaaaa", b"aaaaaa");
test_aux(b"bbbb", b"cccc");
}
}

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -250,43 +249,6 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)]
pub mod test {

View File

@@ -1,6 +1,8 @@
use std::io;
use std::io::{Read, Write};
use byteorder::{ByteOrder, LittleEndian};
use super::BinarySerializable;
/// Variable int serializes a u128 number
@@ -17,6 +19,26 @@ pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
}
}
/// Deserializes a u128 number
///
/// Returns the number and the slice after the vint
pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
let mut result = 0u128;
let mut shift = 0u64;
for i in 0..19 {
let b = data[i];
result |= u128::from(b % 128u8) << shift;
if b >= STOP_BIT {
return Ok((result, &data[i + 1..]));
}
shift += 7;
}
Err(io::Error::new(
io::ErrorKind::InvalidData,
"Failed to deserialize u128 vint",
))
}
/// Wrapper over a `u128` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VIntU128(pub u128);
@@ -58,13 +80,17 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
#[inline]
pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;
const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;
const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
@@ -73,29 +99,25 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
let (res, num_bytes) = if val < START_2 {
(val | STOP_BIT, 1)
} else if val < START_3 {
(
let (res, num_bytes) = match val {
0..=STOP_1 => (val | STOP_BIT, 1),
START_2..=STOP_2 => (
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
)
} else if val < START_4 {
(
),
START_3..=STOP_3 => (
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
)
} else if val < START_5 {
(
),
START_4..=STOP_4 => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
)
} else {
(
),
_ => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
@@ -103,9 +125,9 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
)
),
};
*buf = res.to_le_bytes();
LittleEndian::write_u64(&mut buf[..], res);
&buf[0..num_bytes]
}
@@ -223,6 +245,7 @@ impl BinarySerializable for VInt {
mod tests {
use super::{serialize_vint_u32, BinarySerializable, VInt};
use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128};
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
@@ -261,7 +284,27 @@ mod tests {
let mut buffer2 = [0u8; 8];
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let res2 = serialize_vint_u32(val, &mut buffer2);
assert_eq!(&buffer[..len_vint], res2, "array wrong for {val}");
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
}
fn aux_test_vint_u128(val: u128) {
let mut data = vec![];
serialize_vint_u128(val, &mut data);
let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
assert_eq!(val, deser_val);
let mut out = vec![];
VIntU128(val).serialize(&mut out).unwrap();
let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap();
assert_eq!(val, deser_val.0);
}
#[test]
fn test_vint_u128() {
aux_test_vint_u128(0);
aux_test_vint_u128(1);
aux_test_vint_u128(u128::MAX / 3);
aux_test_vint_u128(u128::MAX);
}
#[test]

View File

@@ -1,266 +1,129 @@
// # Aggregation example
//
// This example shows how you can use built-in aggregations.
// We will use nested aggregations with buckets and metrics:
// - Range buckets and compute the average in each bucket.
// - Term aggregation and compute the min price in each bucket
// ---
// We will use range buckets and compute the average in each bucket.
//
use serde_json::{Deserializer, Value};
use tantivy::aggregation::agg_req::Aggregations;
use serde_json::Value;
use tantivy::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
RangeAggregation,
};
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::query::TermQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};
fn main() -> tantivy::Result<()> {
// # Create Schema
//
// Lets create a schema for a footwear shop, with 4 fields: name, category, stock and price.
// category, stock and price will be fast fields as that's the requirement
// for aggregation queries.
//
let mut schema_builder = Schema::builder();
// In preparation of the `TermsAggregation`, the category field is configured with:
// - `set_fast`
// - `raw` tokenizer
//
// The tokenizer is set to "raw", because the fast field uses the same dictionary as the
// inverted index. (This behaviour will change in tantivy 0.20, where the fast field will
// always be raw tokenized independent from the regular tokenizing)
//
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast(None)
.set_stored();
schema_builder.add_text_field("category", text_fieldtype);
schema_builder.add_f64_field("stock", FAST);
schema_builder.add_f64_field("price", FAST);
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
let price_field = schema_builder.add_f64_field("price", score_fieldtype);
let schema = schema_builder.build();
// # Indexing documents
//
// Lets index a bunch of documents for this example.
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let data = r#"{
"name": "Almond Toe Court Shoes, Patent Black",
"category": "Womens Footwear",
"price": 99.00,
"stock": 5
}
{
"name": "Suede Shoes, Blue",
"category": "Womens Footwear",
"price": 42.00,
"stock": 4
}
{
"name": "Leather Driver Saddle Loafers, Tan",
"category": "Mens Footwear",
"price": 34.00,
"stock": 12
}
{
"name": "Flip Flops, Red",
"category": "Mens Footwear",
"price": 19.00,
"stock": 6
}
{
"name": "Flip Flops, Blue",
"category": "Mens Footwear",
"price": 19.00,
"stock": 0
}
{
"name": "Gold Button Cardigan, Black",
"category": "Womens Casualwear",
"price": 167.00,
"stock": 6
}
{
"name": "Cotton Shorts, Medium Red",
"category": "Womens Casualwear",
"price": 30.00,
"stock": 5
}
{
"name": "Fine Stripe Short SleeveShirt, Grey",
"category": "Mens Casualwear",
"price": 49.99,
"stock": 9
}
{
"name": "Fine Stripe Short SleeveShirt, Green",
"category": "Mens Casualwear",
"price": 49.99,
"offer": 39.99,
"stock": 9
}
{
"name": "Sharkskin Waistcoat, Charcoal",
"category": "Mens Formalwear",
"price": 75.00,
"stock": 2
}
{
"name": "Lightweight Patch PocketBlazer, Deer",
"category": "Mens Formalwear",
"price": 175.50,
"stock": 1
}
{
"name": "Bird Print Dress, Black",
"category": "Womens Formalwear",
"price": 270.00,
"stock": 10
}
{
"name": "Mid Twist Cut-Out Dress, Pink",
"category": "Womens Formalwear",
"price": 540.00,
"stock": 5
}"#;
let mut index_writer = index.writer(50_000_000)?;
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 1f64,
price_field => 0f64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 3f64,
price_field => 1f64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 5f64,
price_field => 1f64,
))?;
index_writer.add_document(doc!(
text_field => "nohit",
highscore_field => 6f64,
price_field => 2f64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 7f64,
price_field => 2f64,
))?;
index_writer.commit()?;
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 11f64,
price_field => 10f64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 14f64,
price_field => 15f64,
))?;
let stream = Deserializer::from_str(data).into_iter::<Value>();
index_writer.add_document(doc!(
text_field => "cool",
highscore_field => 15f64,
price_field => 20f64,
))?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut num_indexed = 0;
for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?;
num_indexed += 1;
if num_indexed > 4 {
// Writing the first segment
index_writer.commit()?;
}
}
// Writing the second segment
index_writer.commit()?;
// We have two segments now. The `AggregationCollector` will run the aggregation on each
// segment and then merge the results into an `IntermediateAggregationResult`.
let reader = index.reader()?;
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let sub_agg_req_1: Aggregations = vec![(
"average_price".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("price".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"score_ranges".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "highscore".to_string(),
ranges: vec![
(-1f64..9f64).into(),
(9f64..14f64).into(),
(14f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req_1,
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher();
// ---
// # Aggregation Query
//
//
// We can construct the query by building the request structure or by deserializing from JSON.
// The JSON API is more stable and therefore recommended.
//
// ## Request 1
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let agg_req_str = r#"
{
"group_by_stock": {
"aggs": {
"average_price": { "avg": { "field": "price" } }
},
"range": {
"field": "stock",
"ranges": [
{ "key": "few", "to": 1.0 },
{ "key": "some", "from": 1.0, "to": 10.0 },
{ "key": "many", "from": 10.0 }
]
}
}
} "#;
// In this Aggregation we want to get the average price for different groups, depending on how
// many items are in stock. We define custom ranges `few`, `some`, `many` via the
// range aggregation.
// For every bucket we want the average price, so we create a nested metric aggregation on the
// range bucket aggregation. Only buckets support nested aggregations.
// ### Request JSON API
//
let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
// We use the `AllQuery` which will pass all documents to the AggregationCollector.
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::to_value(agg_res)?;
// ### Aggregation Result
//
// The resulting structure deserializes in the same JSON format as elastic search.
//
let expected_res = r#"
{
"group_by_stock":{
"buckets":[
{"average_price":{"value":19.0},"doc_count":1,"key":"few","to":1.0},
{"average_price":{"value":124.748},"doc_count":10,"from":1.0,"key":"some","to":10.0},
{"average_price":{"value":152.0},"doc_count":2,"from":10.0,"key":"many"}
]
}
}
"#;
let expected_json: Value = serde_json::from_str(expected_res)?;
assert_eq!(expected_json, res);
// ### Request 2
//
// Now we are interested in the minimum price per category, so we create a bucket per
// category via `TermsAggregation`. We are interested in the highest minimum prices, and set the
// order of the buckets `"order": { "min_price": "desc" }` to be sorted by the the metric of
// the sub aggregation. (awesome)
//
let agg_req_str = r#"
{
"min_price_per_category": {
"aggs": {
"min_price": { "min": { "field": "price" } }
},
"terms": {
"field": "category",
"min_doc_count": 1,
"order": { "min_price": "desc" }
}
}
} "#;
let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::to_value(agg_res)?;
// Minimum price per category, sorted by minimum price descending
//
// As you can see, the starting prices for `Formalwear` are higher than `Casualwear`.
//
let expected_res = r#"
{
"min_price_per_category": {
"buckets": [
{ "doc_count": 2, "key": "Womens Formalwear", "min_price": { "value": 270.0 } },
{ "doc_count": 2, "key": "Mens Formalwear", "min_price": { "value": 75.0 } },
{ "doc_count": 2, "key": "Mens Casualwear", "min_price": { "value": 49.99 } },
{ "doc_count": 2, "key": "Womens Footwear", "min_price": { "value": 42.0 } },
{ "doc_count": 2, "key": "Womens Casualwear", "min_price": { "value": 30.0 } },
{ "doc_count": 3, "key": "Mens Footwear", "min_price": { "value": 19.0 } }
],
"sum_other_doc_count": 0
}
}
"#;
let expected_json: Value = serde_json::from_str(expected_res)?;
assert_eq!(expected_json, res);
println!("{}", serde_json::to_string_pretty(&res)?);
Ok(())
}

View File

@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher.
@@ -217,23 +217,9 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// We can also get an explanation to understand
// how a found document got its score.
let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
let (_score, doc_address) = searcher
.search(&query, &TopDocs::with_limit(1))?
.into_iter()
.next()
.unwrap();
let explanation = query.explain(&searcher, doc_address)?;
println!("{}", explanation.to_pretty_json());
Ok(())
}

View File

@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -105,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
fn collect(&mut self, doc: u32, _score: Score) {
// Since we know the values are single value, we could call `first_or_default_col` on the
// column and fetch single values.
for value in self.fast_field_reader.values_for_doc(doc) {
for value in self.fast_field_reader.values(doc) {
let value = value as f64;
self.stats.count += 1;
self.stats.sum += value;
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example.
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \
@@ -171,7 +171,7 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
// here we want to search for `broom` and use `StatsCollector` on the hits.
// here we want to get a hit on the 'ken' in Frankenstein
let query = query_parser.parse_query("broom")?;
if let Some(stats) =
searcher.search(&query, &StatsCollector::with_field("price".to_string()))?

View File

@@ -1,12 +1,12 @@
// # Defining a tokenizer pipeline
//
// In this example, we'll see how to define a tokenizer
// by creating a custom `NgramTokenizer`.
// In this example, we'll see how to define a tokenizer pipeline
// by aligning a bunch of `TokenFilter`.
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert document we need an index writer.
// There must be only one writer at a time.
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -13,8 +13,7 @@ fn main() -> tantivy::Result<()> {
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast()
.set_precision(tantivy::DateTimePrecision::Seconds);
// Add `occurred_at` date field type
.set_precision(tantivy::DatePrecision::Seconds);
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);
let schema = schema_builder.build();
@@ -22,18 +21,15 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
@@ -45,28 +41,26 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();
// # Search
// # Default fields: event_type
let query_parser = QueryParser::for_index(&index, vec![event_type]);
{
// Simple exact search on the date
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
let query = query_parser.parse_query("event:comment")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1);
}
{
// Range query on the date field
let query = query_parser
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
Some(Value::Date(_))
));
assert_eq!(
retrieved_doc.to_json(&schema),
schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
);
}

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter};
use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document
// given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> {
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
// This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!(
isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_doc_misspelled.to_json(&schema),
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
);
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_new_doc.to_json(&schema),
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
);

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.

View File

@@ -1,18 +1,9 @@
// # Faceted Search With Tweak Score
//
// This example covers the faceted search functionalities of
// tantivy.
//
// We will :
// - define a text field "name" in our schema
// - define a facet field "classification" in our schema
use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
@@ -23,7 +14,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!(
title => "Fried egg",
@@ -64,7 +55,6 @@ fn main() -> tantivy::Result<()> {
.collect(),
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();
@@ -75,7 +65,6 @@ fn main() -> tantivy::Result<()> {
.collect();
move |doc: DocId, original_score: Score| {
// Update the original score with a tweaked score
let missing_ingredients = ingredient_reader
.facet_ords(doc)
.filter(|ord| !query_ords.contains(ord))
@@ -91,10 +80,11 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.as_text()
.unwrap()
.to_owned()
})

View File

@@ -1,167 +0,0 @@
// # Basic Example
//
// This example covers the basic functionalities of
// tantivy.
//
// We will :
// - define our schema
// - create an index in a directory
// - index a few documents into our index
// - search for the best document matching a basic query
// - retrieve the best document's original content.
// ---
// Importing tantivy...
use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the
// sake of this example
let index_path = TempDir::new()?;
// # Defining the schema
//
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// be indexed".
// First we need to define a schema ...
let mut schema_builder = Schema::builder();
// Our first field is title.
// We want full-text search for it, and we also want
// to be able to retrieve the document after the search.
//
// `TEXT | STORED` is some syntactic sugar to describe
// that.
//
// `TEXT` means the field should be tokenized and indexed,
// along with its term frequency and term positions.
//
// `STORED` means that the field will also be saved
// in a compressed, row-oriented key-value store.
// This store is useful for reconstructing the
// documents that were selected during the search phase.
let title = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
// # Indexing documents
//
// Let's create a brand new index.
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create_in_dir(&index_path, schema.clone())?;
// To insert a document we will need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
//
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
// ### Adding documents
//
index_writer.add_document(doc!(
title => "The Name of the Wind",
))?;
index_writer.add_document(doc!(
title => "The Diary of Muadib",
))?;
index_writer.add_document(doc!(
title => "A Dairy Cow",
))?;
index_writer.add_document(doc!(
title => "The Diary of a Young Girl",
))?;
index_writer.commit()?;
// ### Committing
//
// At this point our documents are not searchable.
//
//
// We need to call `.commit()` explicitly to force the
// `index_writer` to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
// the existence of new documents.
//
// This call is blocking.
index_writer.commit()?;
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
// persistently indexed.
//
// In the scenario of a crash or a power failure,
// tantivy behaves as if it has rolled back to its last
// commit.
// # Searching
//
// ### Searcher
//
// A reader is required first in order to search an index.
// It acts as a `Searcher` pool that reloads itself,
// depending on a `ReloadPolicy`.
//
// For a search server you will typically create one reader for the entire lifetime of your
// program, and acquire a new searcher for every single request.
//
// In the code below, we rely on the 'ON_COMMIT' policy: the reader
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()?;
// We now need to acquire a searcher.
//
// A searcher points to a snapshotted, immutable version of the index.
//
// Some search experience might require more than
// one query. Using the same searcher ensures that all of these queries will run on the
// same version of the index.
//
// Acquiring a `searcher` is very cheap.
//
// You should acquire a searcher every time you start processing a request and
// and release it right after your query is finished.
let searcher = reader.searcher();
// ### FuzzyTermQuery
{
let term = Term::from_field_text(title, "Diary");
let query = FuzzyTermQuery::new(term, 2, true);
let (top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(5), Count))
.unwrap();
assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs {
// Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
// score 1.0 doc {"title":["The Diary of Muadib"]}
//
// score 1.0 doc {"title":["The Diary of a Young Girl"]}
//
// score 1.0 doc {"title":["A Dairy Cow"]}
}
}
Ok(())
}

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, Result};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?;
}

View File

@@ -6,14 +6,10 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
// We set the IP field as `INDEXED`, so it can be searched
// `FAST` will create a fast field. The fast field will be used to execute search queries.
// `FAST` is not a requirement for range queries, it can also be executed on the inverted index
// which is created by `INDEXED`.
let mut schema_builder = Schema::builder();
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
let ip = schema_builder.add_ip_addr_field("ip", STORED | INDEXED | FAST);
@@ -22,85 +18,52 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
)?;
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
)?;
index_writer.add_document(doc)?;
// Commit will create a segment containing our documents.
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
// # Search
// Range queries on IPv4. Since we created a fast field, the fast field will be used to execute
// the search.
// ### Range Queries
let query_parser = QueryParser::for_index(&index, vec![event_type, ip]);
{
// Inclusive range queries
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
let query = query_parser.parse_query("ip:[192.168.0.0 TO 192.168.0.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1);
assert_eq!(count_docs.len(), 2);
}
{
// Exclusive range queries
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
let query = query_parser.parse_query("ip:[192.168.1.0 TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 0);
}
{
// Find docs with IP addresses smaller equal 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}
{
// Find docs with IP addresses smaller than 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}
// ### Exact Queries
// Exact search on IPv4.
{
let query = query_parser.parse_query("ip:192.168.0.80")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);
}
// Exact search on IPv6.
// IpV6 addresses need to be quoted because they contain `:`
{
// IpV6 needs to be escaped because it contains `:`
let query = query_parser.parse_query("ip:\"2001:0db8:85a3:0000:0000:8a2e:0370:7334\"")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
// Doc 0: TermFreq 2: [0, 4]
// Doc 2: TermFreq 1: [0]
// ```
println!("Doc {doc_id}: TermFreq {term_freq}: {positions:?}");
println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
doc_id = segment_postings.advance();
}
}
@@ -125,7 +125,7 @@ fn main() -> tantivy::Result<()> {
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {docs:?}");
println!("Docs {:?}", docs);
block_segment_postings.advance();
}
}

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click",

View File

@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()?
};
println!("committed with opstamp {opstamp}");
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));
}

View File

@@ -1,83 +0,0 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
let index_path = TempDir::new()?;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
eighty-four days now without taking a fish.",
))?;
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
))?;
index_writer.commit()?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// This will match documents containing the phrase "in the"
// followed by some word starting with "su",
// i.e. it will match "in the sunlight" and "in the success",
// but not "in the Gulf Stream".
let query = query_parser.parse_query("\"in the su\"*")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;
titles.sort_unstable();
assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);
Ok(())
}

View File

@@ -12,13 +12,12 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut tokenizer = SimpleTokenizer::default();
let mut token_stream = tokenizer.token_stream(text);
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
@@ -38,7 +37,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields
// one by one in a Document object.
@@ -83,7 +82,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?;
@@ -94,7 +93,7 @@ fn main() -> tantivy::Result<()> {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let searcher = reader.searcher();
@@ -115,8 +114,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));
}
// In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,8 +10,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -28,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example.
index_writer.add_document(doc!(
@@ -55,10 +54,13 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!("Document score {}:", score);
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
@@ -50,17 +50,16 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]))
.build();
]));
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
@@ -105,9 +104,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {}:", score);
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

Some files were not shown because too many files have changed in this diff Show More