Files
tantivy/Cargo.toml
PSeitz-dd 203751f2fe Optimize ExistsQuery for a high number of dynamic columns (#2694)
* Optimize ExistsQuery for a high number of dynamic columns

The previous algorithm checked _each_ doc in _each_ column for
existence. This causes huge cost on JSON fields with e.g. 100k columns.
Compute a bitset instead if we have more than one column.

add `iter_docs` to the multivalued_index

* add benchmark

subfields=1
exists_json_union    Memory: 89.3 KB (+2.01%)    Avg: 0.4865ms (-26.03%)    Median: 0.4865ms (-26.03%)    [0.4865ms .. 0.4865ms]
subfields=2
exists_json_union    Memory: 68.1 KB     Avg: 1.7048ms (-0.46%)    Median: 1.7048ms (-0.46%)    [1.7048ms .. 1.7048ms]
subfields=3
exists_json_union    Memory: 61.8 KB     Avg: 2.0742ms (-2.22%)    Median: 2.0742ms (-2.22%)    [2.0742ms .. 2.0742ms]
subfields=4
exists_json_union    Memory: 119.8 KB (+103.44%)    Avg: 3.9500ms (+42.62%)    Median: 3.9500ms (+42.62%)    [3.9500ms .. 3.9500ms]
subfields=5
exists_json_union    Memory: 120.4 KB (+107.65%)    Avg: 3.9610ms (+20.65%)    Median: 3.9610ms (+20.65%)    [3.9610ms .. 3.9610ms]
subfields=6
exists_json_union    Memory: 120.6 KB (+107.49%)    Avg: 3.8903ms (+3.11%)    Median: 3.8903ms (+3.11%)    [3.8903ms .. 3.8903ms]
subfields=7
exists_json_union    Memory: 120.9 KB (+106.93%)    Avg: 3.6220ms (-16.22%)    Median: 3.6220ms (-16.22%)    [3.6220ms .. 3.6220ms]
subfields=8
exists_json_union    Memory: 121.3 KB (+106.23%)    Avg: 4.0981ms (-15.97%)    Median: 4.0981ms (-15.97%)    [4.0981ms .. 4.0981ms]
subfields=16
exists_json_union    Memory: 123.1 KB (+103.09%)    Avg: 4.3483ms (-92.26%)    Median: 4.3483ms (-92.26%)    [4.3483ms .. 4.3483ms]
subfields=256
exists_json_union    Memory: 204.6 KB (+19.85%)    Avg: 3.8874ms (-99.01%)    Median: 3.8874ms (-99.01%)    [3.8874ms .. 3.8874ms]
subfields=4096
exists_json_union    Memory: 2.0 MB     Avg: 3.5571ms (-99.90%)    Median: 3.5571ms (-99.90%)    [3.5571ms .. 3.5571ms]
subfields=65536
exists_json_union    Memory: 28.3 MB     Avg: 14.4417ms (-99.97%)    Median: 14.4417ms (-99.97%)    [14.4417ms .. 14.4417ms]
subfields=262144
exists_json_union    Memory: 113.3 MB     Avg: 66.2860ms (-99.95%)    Median: 66.2860ms (-99.95%)    [66.2860ms .. 66.2860ms]

* rename methods
2025-09-16 18:21:03 +02:00

174 lines
4.8 KiB
TOML

[package]
name = "tantivy"
version = "0.25.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.85"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = [
"std",
"unicode",
] }
aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
fs4 = { version = "0.13.1", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"
columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
test-log = "0.2.10"
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
"use-std",
], default-features = false }
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
[dev-dependencies.fail]
version = "0.5.0"
features = ["failpoints"]
[profile.release]
opt-level = 3
debug = false
debug-assertions = false
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false
[profile.test]
debug-assertions = true
overflow-checks = true
[features]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]
# enable zstd-compression in columnar (and sstable)
columnar-zstd-compression = ["columnar/zstd-compression"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util", "futures-channel"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
[workspace]
members = [
"query-grammar",
"bitpacker",
"common",
"ownedbytes",
"stacker",
"sstable",
"tokenizer-api",
"columnar",
]
# Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points
# in a different binary.
#
# We do that because, fail rely on a global definition of
# failpoints behavior and hence, it is incompatible with
# multithreading.
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["failpoints"]
[[bench]]
name = "analyzer"
harness = false
[[bench]]
name = "index-bench"
harness = false
[[bench]]
name = "agg_bench"
harness = false
[[bench]]
name = "exists_json"
harness = false