From 58aa4b707433a379b0ae716f29b0feff9c2479fc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 13 Apr 2026 19:14:30 +0200 Subject: [PATCH] Fix cardinality aggregation using invalid coupons (#2893) Previously, coupons were computed via murmurhash32 and fed as raw u32 to the HLL sketch, bypassing the sketch's internal hashing and producing invalid (slot, value) pairs. Switch to Coupon::from_hash from the datasketches crate which correctly derives coupons, and drop the now-unused murmurhash32 dependency. Co-authored-by: Claude Opus 4.6 (1M context) --- Cargo.toml | 3 +-- src/aggregation/metric/cardinality.rs | 20 ++++++++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7d2ed19f8..fb27391ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ regex = { version = "1.5.5", default-features = false, features = [ "std", "unicode", ] } -murmurhash32 = "0.3" aho-corasick = "1.0" tantivy-fst = "0.5" memmap2 = { version = "0.9.0", optional = true } @@ -66,7 +65,7 @@ tantivy-bitpacker = { version = "0.10", path = "./bitpacker" } common = { version = "0.11", path = "./common/", package = "tantivy-common" } tokenizer-api = { version = "0.7", path = "./tokenizer-api", package = "tantivy-tokenizer-api" } sketches-ddsketch = { version = "0.4", features = ["use_serde"] } -datasketches = { git = "https://github.com/fulmicoton-dd/datasketches-rust", rev = "eb4ad64" } +datasketches = { git = "https://github.com/fulmicoton-dd/datasketches-rust", rev = "7635fb8" } futures-util = { version = "0.3.28", optional = true } futures-channel = { version = "0.3.28", optional = true } fnv = "1.0.7" diff --git a/src/aggregation/metric/cardinality.rs b/src/aggregation/metric/cardinality.rs index 32aa746f8..e68cd6e48 100644 --- a/src/aggregation/metric/cardinality.rs +++ b/src/aggregation/metric/cardinality.rs @@ -4,7 +4,7 @@ use std::io; use columnar::column_values::CompactSpaceU64Accessor; use columnar::{Column, ColumnType, Dictionary, StrColumn}; -use datasketches::hll::{HllSketch, HllType, HllUnion}; +use datasketches::hll::{Coupon, HllSketch, HllType, HllUnion}; use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -121,21 +121,17 @@ impl CardinalityAggregationReq { } } -/// A coupon is the hash used to represent our elements in our cardinality sketch. -/// TODO switch to u64, but this requires updating the lib upstream. -type Coupon = u32; - /// A CouponCache is here to cache the mapping term ordinal -> coupon (see above). /// The idea is that we do not want to fetch terms associated to several term ordinals, /// several times due to the fact that we have several buckets. enum CouponCache { Dense { coupon_map: Vec, - missing_coupon_opt: Option, + missing_coupon_opt: Option, }, Sparse { coupon_map: FxHashMap, - missing_coupon_opt: Option, + missing_coupon_opt: Option, }, } @@ -163,7 +159,7 @@ impl CouponCache { let should_use_dense = highest_term_ord < 1_000_000u64 || highest_term_ord < num_terms as u64 * 3u64; if should_use_dense { - let mut coupon_map: Vec = vec![0; highest_term_ord as usize + 1]; + let mut coupon_map: Vec = vec![Coupon::EMPTY; highest_term_ord as usize + 1]; for (term_ord, coupon) in term_ords.into_iter().zip(coupons.into_iter()) { coupon_map[term_ord as usize] = coupon; } @@ -172,7 +168,7 @@ impl CouponCache { missing_coupon_opt, } } else { - let coupon_map: FxHashMap = term_ords.into_iter().zip(coupons).collect(); + let coupon_map: FxHashMap = term_ords.into_iter().zip(coupons).collect(); CouponCache::Sparse { coupon_map, missing_coupon_opt, @@ -266,7 +262,7 @@ fn build_coupon_cache( let mut coupons: Vec = Vec::with_capacity(term_ords.len()); let all_term_ords_found: bool = dictionary.sorted_ords_to_term_cb(&term_ords, |term_bytes| { - let coupon: Coupon = murmurhash32::murmurhash2(term_bytes); + let coupon: Coupon = Coupon::from_hash(term_bytes); coupons.push(coupon); })?; assert!(all_term_ords_found); @@ -275,14 +271,14 @@ fn build_coupon_cache( // we populate the cache with the missing key too (if any). let missing_coupon_opt: Option = missing_value_opt.map(|missing_key| { if let Key::Str(missing_value_str) = missing_key { - murmurhash32::murmurhash2(missing_value_str.as_bytes()) + Coupon::from_hash(missing_value_str.as_bytes()) } else { // See https://github.com/quickwit-oss/tantivy/issues/2891 // A missing key with a type different from Str will not work as intended // for the moment. // // Right now this is just a partial workaround. - 35679954u32 + Coupon::from_hash("__tantivy_missing_non_str__".as_bytes()) } }); Ok(CouponCache::new(term_ords, coupons, missing_coupon_opt))