mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
* Optimize ExistsQuery for a high number of dynamic columns The previous algorithm checked _each_ doc in _each_ column for existence. This causes huge cost on JSON fields with e.g. 100k columns. Compute a bitset instead if we have more than one column. add `iter_docs` to the multivalued_index * add benchmark subfields=1 exists_json_union Memory: 89.3 KB (+2.01%) Avg: 0.4865ms (-26.03%) Median: 0.4865ms (-26.03%) [0.4865ms .. 0.4865ms] subfields=2 exists_json_union Memory: 68.1 KB Avg: 1.7048ms (-0.46%) Median: 1.7048ms (-0.46%) [1.7048ms .. 1.7048ms] subfields=3 exists_json_union Memory: 61.8 KB Avg: 2.0742ms (-2.22%) Median: 2.0742ms (-2.22%) [2.0742ms .. 2.0742ms] subfields=4 exists_json_union Memory: 119.8 KB (+103.44%) Avg: 3.9500ms (+42.62%) Median: 3.9500ms (+42.62%) [3.9500ms .. 3.9500ms] subfields=5 exists_json_union Memory: 120.4 KB (+107.65%) Avg: 3.9610ms (+20.65%) Median: 3.9610ms (+20.65%) [3.9610ms .. 3.9610ms] subfields=6 exists_json_union Memory: 120.6 KB (+107.49%) Avg: 3.8903ms (+3.11%) Median: 3.8903ms (+3.11%) [3.8903ms .. 3.8903ms] subfields=7 exists_json_union Memory: 120.9 KB (+106.93%) Avg: 3.6220ms (-16.22%) Median: 3.6220ms (-16.22%) [3.6220ms .. 3.6220ms] subfields=8 exists_json_union Memory: 121.3 KB (+106.23%) Avg: 4.0981ms (-15.97%) Median: 4.0981ms (-15.97%) [4.0981ms .. 4.0981ms] subfields=16 exists_json_union Memory: 123.1 KB (+103.09%) Avg: 4.3483ms (-92.26%) Median: 4.3483ms (-92.26%) [4.3483ms .. 4.3483ms] subfields=256 exists_json_union Memory: 204.6 KB (+19.85%) Avg: 3.8874ms (-99.01%) Median: 3.8874ms (-99.01%) [3.8874ms .. 3.8874ms] subfields=4096 exists_json_union Memory: 2.0 MB Avg: 3.5571ms (-99.90%) Median: 3.5571ms (-99.90%) [3.5571ms .. 3.5571ms] subfields=65536 exists_json_union Memory: 28.3 MB Avg: 14.4417ms (-99.97%) Median: 14.4417ms (-99.97%) [14.4417ms .. 14.4417ms] subfields=262144 exists_json_union Memory: 113.3 MB Avg: 66.2860ms (-99.95%) Median: 66.2860ms (-99.95%) [66.2860ms .. 66.2860ms] * rename methods
174 lines
4.8 KiB
TOML
174 lines
4.8 KiB
TOML
# Crate metadata for the `tantivy` package.
[package]
name = "tantivy"
version = "0.25.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.85"
# Keep the JSON/text files under benches/ out of the packaged crate.
exclude = ["benches/*.json", "benches/*.txt"]

# Runtime dependencies. Entries marked `optional = true` are only compiled
# when a feature in the [features] table names them.
[dependencies]
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = [
    "std",
    "unicode",
] }
aho-corasick = "1.0"
tantivy-fst = "0.5"
# Enabled by the `mmap` feature.
memmap2 = { version = "0.9.0", optional = true }
# Enabled by the `lz4-compression` feature.
lz4_flex = { version = "0.11", default-features = false, optional = true }
# Enabled by the `zstd-compression` feature.
zstd = { version = "0.13", optional = true, default-features = false }
# Enabled by the `mmap` feature.
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
# Enabled by the `mmap` feature.
fs4 = { version = "0.13.1", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
    "bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
# Enabled by the `failpoints` feature.
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"

# Crates that live in this repository: each is resolved through `path`, and
# `package` gives the real crate name where the dependency key is a shorter alias.
columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
# `sstable` is enabled by the `quickwit` feature.
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
# The two futures crates are enabled by the `quickwit` feature.
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"

# Dependencies that are only built when the target is Windows.
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

# Dependencies used only by tests, examples and benchmarks.
[dev-dependencies]
binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
test-log = "0.2.10"
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
# Same crate as the runtime `time` dependency, with the `macros` feature added.
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
    "use-std",
], default-features = false }

# Dev-dependencies that are only built when the target is not Windows.
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }

# As a dev-dependency, `fail` is built with its `failpoints` feature enabled.
[dev-dependencies.fail]
version = "0.5.0"
features = ["failpoints"]

# Release builds: full optimisation, no debug info, no debug assertions.
[profile.release]
opt-level = 3
debug = false
debug-assertions = false

# Benchmark builds: same optimisation level as release, but debug info is kept.
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false

# Test builds: debug assertions and integer-overflow checks are both enabled.
[profile.test]
debug-assertions = true
overflow-checks = true

# Cargo features. Each value lists the optional dependencies and/or
# dependency features that the feature turns on.
[features]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]

# enable zstd-compression in columnar (and sstable)
columnar-zstd-compression = ["columnar/zstd-compression"]

failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["sstable", "futures-util", "futures-channel"]

# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]

# Sub-crates of this repository that are members of the Cargo workspace.
[workspace]
members = [
    "query-grammar",
    "bitpacker",
    "common",
    "ownedbytes",
    "stacker",
    "sstable",
    "tokenizer-api",
    "columnar",
]

# Following the "fail" crate best practices, we isolate
# tests that define specific behavior in fail check points
# in a different binary.
#
# We do that because `fail` relies on a global definition of
# failpoints behavior and hence, it is incompatible with
# multithreading.
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
# This test target is only built when the `failpoints` feature is enabled.
required-features = ["failpoints"]

# `harness = false`: the bench target provides its own `main` instead of
# using the built-in libtest harness.
[[bench]]
name = "analyzer"
harness = false

# `harness = false`: the bench target provides its own `main` instead of
# using the built-in libtest harness.
[[bench]]
name = "index-bench"
harness = false

# `harness = false`: the bench target provides its own `main` instead of
# using the built-in libtest harness.
[[bench]]
name = "agg_bench"
harness = false

# `harness = false`: the bench target provides its own `main` instead of
# using the built-in libtest harness.
[[bench]]
name = "exists_json"
harness = false