diff --git a/src/aggregation/metric/stats.rs b/src/aggregation/metric/stats.rs index 304884b14..6afea5eb8 100644 --- a/src/aggregation/metric/stats.rs +++ b/src/aggregation/metric/stats.rs @@ -287,6 +287,161 @@ impl SegmentAggregationCollector for SegmentStatsCollector { } } + +pub struct ExtendedStats { + /// The number of documents. + pub count: u64, + /// The sum of the fast field values. + pub sum: f64, + /// The min value of the fast field values. + pub min: Option, + /// The max value of the fast field values. + pub max: Option, + /// The average of the fast field values. `None` if count equals zero. + pub avg: Option, + /// The sum of squares of the fast field values. `None` if count equals zero. + pub sum_of_squares: Option, + /// The variance of the fast field values. `None` if count is less then 2. + pub variance: Option, + /// The variance population of the fast field values, always equal to variance. `None` if count is less then 2. + pub variance_population: Option, + /// The variance sampling of the fast field values, always equal to variance. `None` if count is less then 2. + pub variance_sampling: Option, + /// The standard deviation of the fast field values. `None` if count is less then 2. + pub standard_deviation: Option, + /// The standard deviation of the fast field values, always equal to variance. `None` if count is less then 2. + pub standard_deviation_population: Option, + /// The standard deviation sampling of the fast field values, always equal to variance. `None` if count is less then 2. + pub standard_deviation_sampling: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IntermediateExtendedStats { + /// The number of extracted values. + count: u64, + /// The sum of the extracted values. + sum: f64, + /// The min value. + min: f64, + /// The max value. + max: f64, + // The sum of the square values it's referred as M2 in Welford's online algorithm + sum_of_squares: f64, + // The mean an intermediate value need for calculating the variance + // as per [Welford's online algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm) + mean: f64, +} + +impl Default for IntermediateExtendedStats { + fn default() -> Self { + Self { + count: 0, + sum: 0.0, + min: f64::MAX, + max: f64::MIN, + sum_of_squares: 0.0, + mean: 0.0, + } + } +} + +impl IntermediateExtendedStats { + /// Merges the other stats intermediate result into self. + pub fn merge_fruits(&mut self, other: IntermediateExtendedStats) { + + self.min = self.min.min(other.min); + self.max = self.max.max(other.max); + + + if other.count!=0 { + if self.count==0 { + self.sum_of_squares=other.sum_of_squares; + self.count=other.count; + self.mean=other.mean; + } else { + let new_count=self.count+other.count; + let delta = other.sum/other.count as f64 - self.sum/self.count as f64; + self.sum_of_squares += other.sum_of_squares + delta * delta * self.count as f64 * other.count as f64/new_count as f64; + self.count =new_count; + //self.mean=self.mean + delta*other.count as f64/new_count as f64; + self.mean=(self.sum as f64 + other.sum as f64)/new_count as f64; + + } + self.sum += other.sum; + } + + } + + + + /// Computes the final stats value. + pub fn finalize(&self) -> ExtendedStats { + let min = if self.count == 0 { + None + } else { + Some(self.min) + }; + let max = if self.count == 0 { + None + } else { + Some(self.max) + }; + let avg = if self.count == 0 { + None + } else { + Some(self.mean) + }; + let sum_of_squares = if self.count == 0 { + None + } else { + Some(self.sum_of_squares) + }; + let variance = if self.count <= 1 { + None + } else { + Some(self.sum_of_squares/self.count as f64) + }; + let variance_sampling = if self.count <= 1 { + None + } else { + Some(self.sum_of_squares/(self.count-1) as f64) + }; + let standard_deviation = variance.map(|v| v.sqrt()); + let standard_deviation_sampling = variance_sampling.map(|v| v.sqrt()); + + ExtendedStats { + count: self.count, + sum: self.sum, + min, + max, + avg, + sum_of_squares, + variance, + variance_population: variance, + variance_sampling, + standard_deviation, + standard_deviation_population: standard_deviation, + standard_deviation_sampling + } + } + + fn collect(&mut self, value: f64) { + self.count += 1; + self.sum += value; + self.min = self.min.min(value); + self.max = self.max.max(value); + self.update_variance(value); + } + + fn update_variance(&mut self, value: f64) { + let delta = value - self.mean; + self.mean += delta / self.count as f64; + let delta2 = value - self.mean; + self.sum_of_squares += delta * delta2; + } +} + + #[cfg(test)] mod tests { @@ -302,6 +457,8 @@ mod tests { use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::{Index, IndexWriter, Term}; + use crate::aggregation::metric::IntermediateExtendedStats; + #[test] fn test_aggregation_stats_empty_index() -> crate::Result<()> { // test index without segments @@ -642,4 +799,234 @@ mod tests { Ok(()) } + + + #[test] + fn extended_stat_zero_value() { + let intermediate_extend_stats= IntermediateExtendedStats::default(); + let extended_stats=intermediate_extend_stats.finalize(); + assert!(extended_stats.variance.is_none()); + assert!(extended_stats.variance_population.is_none()); + assert!(extended_stats.variance_sampling.is_none()); + assert!(extended_stats.sum_of_squares.is_none()); + assert!(extended_stats.standard_deviation.is_none()); + assert!(extended_stats.standard_deviation_population.is_none()); + assert!(extended_stats.standard_deviation_sampling.is_none()); + } + + #[test] + fn extended_stat_one_value() { + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + let extended_stats=intermediate_extend_stats.finalize(); + assert!(extended_stats.variance.is_none()); + assert!(extended_stats.variance_population.is_none()); + assert!(extended_stats.variance_sampling.is_none()); + assert!(extended_stats.standard_deviation.is_none()); + assert!(extended_stats.standard_deviation_population.is_none()); + assert!(extended_stats.standard_deviation_sampling.is_none()); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(0.0f64,sum_of_squares); + } + + #[test] + fn extended_stat_multiple_values() { + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + intermediate_extend_stats.collect(2.0f64); + intermediate_extend_stats.collect(3.0f64); + intermediate_extend_stats.collect(4.0f64); + intermediate_extend_stats.collect(5.0f64); + let extended_stats=intermediate_extend_stats.finalize(); + let variance=extended_stats.variance.unwrap(); + assert_eq!(2.0f64,variance); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(2.0f64,variance_population); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(2.5f64,variance_sampling); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation_population); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(2.5f64.sqrt(),standard_deviation_sampling); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(10f64,sum_of_squares); + let avg=extended_stats.avg.unwrap(); + assert_eq!(3.0f64,avg); + + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + intermediate_extend_stats.collect(3.0f64); + intermediate_extend_stats.collect(4.0f64); + intermediate_extend_stats.collect(5.0f64); + intermediate_extend_stats.collect(8.0f64); + intermediate_extend_stats.collect(10.0f64); + let extended_stats=intermediate_extend_stats.finalize(); + let variance=extended_stats.variance.unwrap(); + assert_eq!(9.138888888888888f64,variance); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(9.138888888888888f64,variance_population); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(10.966666666666665f64,variance_sampling); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(9.138888888888888f64.sqrt(),standard_deviation); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(9.138888888888888f64.sqrt(),standard_deviation_population); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(10.966666666666665f64.sqrt(),standard_deviation_sampling); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(54.83333333333333f64,sum_of_squares); + let avg=extended_stats.avg.unwrap(); + assert_eq!(5.166666666666667,avg); + } + + #[test] + fn merge_empty_with_one_value() { + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + let mut intermediate_extend_stats1= IntermediateExtendedStats::default(); + intermediate_extend_stats1.collect(1.0f64); + intermediate_extend_stats.merge_fruits(intermediate_extend_stats1); + let extended_stats=intermediate_extend_stats.finalize(); + assert!(extended_stats.variance.is_none()); + assert!(extended_stats.variance_population.is_none()); + assert!(extended_stats.variance_sampling.is_none()); + assert!(extended_stats.standard_deviation.is_none()); + assert!(extended_stats.standard_deviation_population.is_none()); + assert!(extended_stats.standard_deviation_sampling.is_none()); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(0.0f64,sum_of_squares); + } + + #[test] + fn merge_empty_with_multiple_values() { + + let mut intermediate_extend_stats1= IntermediateExtendedStats::default(); + intermediate_extend_stats1.collect(1.0f64); + intermediate_extend_stats1.collect(2.0f64); + intermediate_extend_stats1.collect(3.0f64); + intermediate_extend_stats1.collect(4.0f64); + intermediate_extend_stats1.collect(5.0f64); + + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.merge_fruits(intermediate_extend_stats1); + let extended_stats=intermediate_extend_stats.finalize(); + + let variance=extended_stats.variance.unwrap(); + assert_eq!(2.0f64,variance); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(2.0f64,variance_population); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(2.5f64,variance_sampling); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation_population); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(2.5f64.sqrt(),standard_deviation_sampling); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(10f64,sum_of_squares); + } + + #[test] + fn merge_non_empty_extended_stats() { + + let mut intermediate_extend_stats1= IntermediateExtendedStats::default(); + intermediate_extend_stats1.collect(3.0f64); + intermediate_extend_stats1.collect(4.0f64); + intermediate_extend_stats1.collect(5.0f64); + + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + intermediate_extend_stats.collect(2.0f64); + intermediate_extend_stats.merge_fruits(intermediate_extend_stats1); + let extended_stats=intermediate_extend_stats.finalize(); + + let variance=extended_stats.variance.unwrap(); + assert_eq!(2.0f64,variance); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(2.0f64,variance_population); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(2.5f64,variance_sampling); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(2.0f64.sqrt(),standard_deviation_population); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(2.5f64.sqrt(),standard_deviation_sampling); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(10f64,sum_of_squares); + + + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + intermediate_extend_stats.collect(3.0f64); + intermediate_extend_stats.collect(4.0f64); + let mut intermediate_extend_stats1= IntermediateExtendedStats::default(); + intermediate_extend_stats1.collect(5.0f64); + intermediate_extend_stats1.collect(8.0f64); + intermediate_extend_stats1.collect(10.0f64); + intermediate_extend_stats.merge_fruits(intermediate_extend_stats1); + let extended_stats=intermediate_extend_stats.finalize(); + let variance=extended_stats.variance.unwrap(); + assert_eq!(9.138888888888888f64,variance); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(9.138888888888888f64,variance_population); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(10.966666666666665f64,variance_sampling); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(9.138888888888888f64.sqrt(),standard_deviation); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(9.138888888888888f64.sqrt(),standard_deviation_population); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(10.966666666666665f64.sqrt(),standard_deviation_sampling); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(54.83333333333333f64,sum_of_squares); + let avg=extended_stats.avg.unwrap(); + assert_eq!(5.166666666666667,avg); + } + + + fn round_f64(value: f64, digits: u32) -> f64 { + let y = 10u64.pow(digits) as f64; + (value * y).round() / y + } + + #[test] + fn test_round() { + assert_eq!(round_f64(4.365, 2), 4.37); + assert_eq!(round_f64(9.138888888888888,12),9.138888888889); + } + + #[test] + fn merge_and_then_collect_non_empty_extended_stats() { + + let mut intermediate_extend_stats= IntermediateExtendedStats::default(); + intermediate_extend_stats.collect(1.0f64); + intermediate_extend_stats.collect(3.0f64); + + let mut intermediate_extend_stats1= IntermediateExtendedStats::default(); + intermediate_extend_stats1.collect(5.0f64); + intermediate_extend_stats1.collect(8.0f64); + intermediate_extend_stats1.collect(10.0f64); + intermediate_extend_stats.merge_fruits(intermediate_extend_stats1); + intermediate_extend_stats.collect(4.0f64); + let extended_stats=intermediate_extend_stats.finalize(); + let variance=extended_stats.variance.unwrap(); + assert_eq!(round_f64(9.138888888888888,12),round_f64(variance,12)); + let variance_population=extended_stats.variance_population.unwrap(); + assert_eq!(round_f64(9.138888888888888,12),round_f64(variance_population,12)); + let variance_sampling=extended_stats.variance_sampling.unwrap(); + assert_eq!(round_f64(10.966666666666665,12),round_f64(variance_sampling,12)); + let standard_deviation=extended_stats.standard_deviation.unwrap(); + assert_eq!(round_f64(9.138888888888888_f64.sqrt(),12),round_f64(standard_deviation,12)); + let standard_deviation_population=extended_stats.standard_deviation_population.unwrap(); + assert_eq!(round_f64(9.138888888888888_f64.sqrt(),12),round_f64(standard_deviation_population,12)); + let standard_deviation_sampling=extended_stats.standard_deviation_sampling.unwrap(); + assert_eq!(round_f64(10.966666666666665_f64.sqrt(),12),round_f64(standard_deviation_sampling,12)); + let sum_of_squares=extended_stats.sum_of_squares.unwrap(); + assert_eq!(round_f64(54.83333333333333f64,12),round_f64(sum_of_squares,12)); + let avg=extended_stats.avg.unwrap(); + assert_eq!(5.166666666666667,avg); + } }