mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-31 23:50:41 +00:00
Switch tantivy's cardinality aggregation from the hyperloglogplus crate (HyperLogLog++ with p=16) to the official Apache DataSketches HLL implementation (datasketches crate v0.2.0 with lg_k=11, Hll4).

This enables returning raw HLL sketch bytes from pomsky to Datadog's event query, where they can be properly deserialized and merged using the same DataSketches library (Java). The previous implementation required pomsky to fabricate fake HLL sketches from scalar cardinality estimates, which produced incorrect results when merged.

Changes:
- Cargo.toml: hyperloglogplus 0.4.1 -> datasketches 0.2.0
- CardinalityCollector: HyperLogLogPlus<u64, BuildSaltedHasher> -> HllSketch
- Custom Serde impl using the HllSketch binary format (cross-shard compatibility)
- New to_sketch_bytes() for external consumers (pomsky)
- Salt preserved via (salt, value) tuple hashing for column type disambiguation
- Removed the BuildSaltedHasher struct
- Added 4 new unit tests (serde roundtrip, merge, binary compat, salt)
205 lines
5.2 KiB
TOML
205 lines
5.2 KiB
TOML
# Crate metadata for the top-level `tantivy` package.
[package]
name = "tantivy"
version = "0.26.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.85"
# Keep benchmark data files out of the published package.
exclude = ["benches/*.json", "benches/*.txt"]
|
|
|
|
# Runtime dependencies. Entries marked `optional = true` are only pulled in
# through the matching entry in the [features] table.
[dependencies]
oneshot = "0.1.13"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = [
    "std",
    "unicode",
] }
aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.12", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
fs4 = { version = "0.13.1", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = { version = "1.2.0", optional = true }
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.3", default-features = false, features = [
    "bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.16.3"
fastdivide = "0.4.0"
itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"

# Path dependencies on crates that live in this repository (see [workspace]).
columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }

sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
datasketches = "0.2.0"
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"
typetag = "0.2.21"
|
|
|
|
# Dependencies that are only built when targeting Windows.
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
|
|
|
|
# Dependencies used only by tests, examples and benchmarks.
[dev-dependencies]
binggan = "0.14.2"
rand = "0.9"
maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.7.0"
test-log = "0.2.10"
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.5"
# Same crate as the runtime `time` dependency, with the extra `macros` feature.
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
    "use-std",
], default-features = false }
|
|
|
|
# Dev-dependencies that are skipped when targeting Windows.
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
|
|
|
|
# As a dev-dependency, `fail` is built with its `failpoints` feature enabled.
[dev-dependencies.fail]
version = "0.5.0"
features = ["failpoints"]
|
|
|
|
# Release builds: full optimization, no debug info, no debug assertions.
[profile.release]
opt-level = 3
debug = false
debug-assertions = false
|
|
|
|
# Benchmark builds: same optimization level as release, but with debug info.
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false
|
|
|
|
# Test builds: keep debug assertions and integer overflow checks enabled.
[profile.test]
debug-assertions = true
overflow-checks = true
|
|
|
|
# Cargo features. Most of them simply switch on the optional dependencies
# declared in [dependencies].
[features]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression", "stemmer"]
stemmer = ["rust-stemmers"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]

# Enable zstd-compression in columnar (and sstable).
columnar-zstd-compression = ["columnar/zstd-compression"]

failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["sstable", "futures-util", "futures-channel"]

# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms when there is a hash collision.
# Uses 64-bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
|
|
|
|
# Crates in this repository that are built as part of the same workspace.
[workspace]
members = [
    "query-grammar",
    "bitpacker",
    "common",
    "ownedbytes",
    "stacker",
    "sstable",
    "tokenizer-api",
    "columnar",
]
|
|
|
|
# Following the "fail" crate best practices, we isolate
# tests that define specific behavior at failpoints
# in a different binary.
#
# We do that because "fail" relies on a global definition of
# failpoint behavior and hence is incompatible with
# multithreading.
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["failpoints"]
|
|
|
|
# Benchmark targets. Every entry sets `harness = false`, which disables the
# default libtest benchmark harness for that target.
[[bench]]
name = "analyzer"
harness = false

[[bench]]
name = "index-bench"
harness = false

[[bench]]
name = "agg_bench"
harness = false

[[bench]]
name = "exists_json"
harness = false

[[bench]]
name = "range_query"
harness = false

[[bench]]
name = "and_or_queries"
harness = false

[[bench]]
name = "range_queries"
harness = false

[[bench]]
name = "bool_queries_with_range"
harness = false

[[bench]]
name = "str_search_and_get"
harness = false

[[bench]]
name = "merge_segments"
harness = false

[[bench]]
name = "regex_all_terms"
harness = false
|
|
|