Add comments and update test

Konstantin Knizhnik
2024-09-15 08:18:46 +02:00
parent f73128fcaf
commit b71ffd7e60
3 changed files with 44 additions and 24 deletions

View File

@@ -6,7 +6,7 @@
  * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
  *
  * Implements https://hal.science/hal-00465313/document
- * 
+ *
  * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
  * suited to estimating the cardinality of very large sets; in particular, we
  * have not attempted to further optimize the implementation as described in
@@ -132,7 +132,7 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 		int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC;
 		uint32_t new_histogram[HIST_SIZE] = {0};
 		for (int i = 0; i < HIST_SIZE; i++) {
-			/* Use average point of interval */
+			/* Use middle point of interval */
 			uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2)) / HIST_MIN_INTERVAL);
 			uint32 cell = Min(interval_log2, HIST_SIZE-1);
 			new_histogram[cell] += cState->regs[index][count].histogram[i];
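
The rescaling above ages each cell by the idle time delta and re-buckets its contents using the cell's middle point: cell i spans (HIST_MIN_INTERVAL*2^(i-1), HIST_MIN_INTERVAL*2^i] seconds, and HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2 is exactly the midpoint of that span. A worked example with HIST_MIN_INTERVAL = 60: for cell i = 1 (covering 60..120 s, midpoint 90 s) and delta = 300 s, the shifted age is 300 + 90 = 390 s, and pg_ceil_log2_32(390 / 60) = pg_ceil_log2_32(6) = 3, so the counts migrate to cell 3.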
@@ -140,16 +140,26 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 		memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram);
 	}
 	cState->regs[index][count].ts = now;
-	cState->regs[index][count].histogram[0] += 1;
+	cState->regs[index][count].histogram[0] += 1; // the most recent access always goes to the first histogram bucket
 }
 
 static uint32_t
 getAccessCount(const HyperLogLogRegister* reg, time_t duration)
 {
 	uint32_t count = 0;
-	//for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2 <= duration; i++) {
-	for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
-		count += reg->histogram[i];
+	// The simplest solution is to take into account all points from the overlapped interval:
+	// for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
+	for (size_t i = 0; i < HIST_SIZE; i++) {
+		uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i);
+		uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2);
+		if (high_boundary >= duration) {
+			// Assume uniform distribution of points within the interval and take a proportional number of points
+			Assert(duration >= low_boundary);
+			count += reg->histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary);
+			break; // it's the last interval within the specified time range
+		} else {
+			count += reg->histogram[i];
+		}
 	}
 	return count;
 }
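
For illustration, a minimal standalone sketch of the interpolation above; access_count(), main() and the sample histogram values are hypothetical and only mirror the arithmetic of the new loop, assuming HIST_MIN_INTERVAL = 60 and HIST_SIZE = 8 as defined in the header:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HIST_MIN_INTERVAL 60 /* seconds, as in the header */
#define HIST_SIZE 8

/* Hypothetical standalone mirror of the new getAccessCount() loop:
 * bucket i covers (60*2^(i-1), 60*2^i] seconds, and the bucket that
 * straddles `duration` contributes a share proportional to the overlap. */
static uint32_t
access_count(const uint32_t histogram[HIST_SIZE], uint32_t duration)
{
	uint32_t count = 0;
	for (size_t i = 0; i < HIST_SIZE; i++) {
		uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i);
		uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2);
		if (high_boundary >= duration) {
			assert(duration >= low_boundary);
			count += histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary);
			break; /* last interval within the requested time range */
		}
		count += histogram[i]; /* bucket lies entirely inside the range */
	}
	return count;
}

int main(void)
{
	/* 10 accesses in every bucket; a 90-second window fully covers
	 * bucket 0 (0..60 s] and half of bucket 1 (60..120 s], so the
	 * estimate is 10 + 10*(90-60)/(120-60) = 15. */
	uint32_t histogram[HIST_SIZE] = {10, 10, 10, 10, 10, 10, 10, 10};
	printf("%u\n", access_count(histogram, 90)); /* prints 15 */
	return 0;
}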
@@ -159,22 +169,35 @@ getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, double min_hit_ratio)
 {
 	uint8 max = 0;
 	size_t i, j;
 
-	uint32_t total_count = 0;
-	for (i = 0; i < HLL_C_BITS + 1; i++)
-	{
-		total_count += getAccessCount(&reg[i], duration);
-	}
-	if (total_count != 0)
+	if (min_hit_ratio == 1.0)
 	{
 		for (i = 0; i < HLL_C_BITS + 1; i++)
 		{
-			if (reg[i].ts >= since && 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= min_hit_ratio)
+			if (reg[i].ts >= since)
 			{
 				max = i;
 			}
 		}
 	}
+	else
+	{
+		uint32_t total_count = 0;
+		for (i = 0; i < HLL_C_BITS + 1; i++)
+		{
+			total_count += getAccessCount(&reg[i], duration);
+		}
+		if (total_count != 0)
+		{
+			for (i = 0; i < HLL_C_BITS + 1; i++)
+			{
+				// Take into account only bits with access frequency exceeding the maximal miss rate (1 - hit rate)
+				if (reg[i].ts >= since && 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= min_hit_ratio)
+				{
+					max = i;
+				}
+			}
+		}
+	}
 	return max;
 }
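
Why the min_hit_ratio == 1.0 fast path is sound: getAccessCount() never exceeds total_count, so 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= 1.0 holds for every register, and with min_hit_ratio == 1.0 the frequency filter accepts everything. The dedicated branch therefore keeps only the reg[i].ts >= since check, skips HLL_C_BITS + 1 extra histogram scans, and also honors registers when total_count is 0, which the old code silently skipped.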

View File

@@ -55,8 +55,8 @@
 /*
  * Number of histogram cells. We use exponential histogram with first interval
- * equals to one minutes. Autoscaler request LFC statistic with intervals 1,2,...,60 seconds,
- * so 1^8=64 seems to be enough for our needs.
+ * equal to one minute. The autoscaler requests LFC statistics with intervals of 1,2,...,60 minutes,
+ * so 8 cells covering up to 2^7 = 128 minutes are enough for our needs.
  */
 
 #define HIST_SIZE 8
 #define HIST_MIN_INTERVAL 60 /* seconds */
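
With HIST_MIN_INTERVAL = 60 and HIST_SIZE = 8, cell i covers ages in (60*2^(i-1), 60*2^i] seconds, giving cell upper boundaries of 1, 2, 4, 8, 16, 32, 64 and 128 minutes; every 1..60-minute window the autoscaler can request therefore falls within the histogram's reach, with the last cell left as headroom.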
@@ -77,15 +77,12 @@
  * modified timestamp >= the query timestamp. This value is the number of bits
  * for this register in the normal HLL calculation.
  *
- * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
- * Usage could be halved if we decide to reduce the required time dimension
- * precision; as 32 bits in second precision should be enough for statistics.
- * However, that is not yet implemented.
+ * The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB.
  */
 typedef struct
 {
 	TimestampTz ts; /* last access timestamp */
-	uint32_t histogram[HIST_SIZE]; /* access counter histogram */
+	uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */
 } HyperLogLogRegister;
typedef struct HyperLogLogState
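
The updated figure checks out, assuming B = 10 and C = 22 (the values consistent with both the old and the new numbers): sizeof(HyperLogLogRegister) = sizeof(TimestampTz) + HIST_SIZE * sizeof(uint32_t) = 8 + 8*4 = 40 bytes, so 2^10 * (22 + 1) * 40 = 942,080 bytes = 920 KiB; the old estimate was the same 2^10 * 23 registers times sizeof(TimestampTz) = 8 bytes, i.e. 188,416 bytes = 184 KiB.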

View File

@@ -137,8 +137,8 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
     cur.execute(
         "create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
     )
-    cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))")
-    cur.execute("insert into t_small(pk) values (generate_series(1,100000))")
+    cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))")  # table size is 21277 pages
+    cur.execute("insert into t_small(pk) values (generate_series(1,100000))")  # table size is 2128 pages
     time.sleep(2)
     before = time.monotonic()
     for _ in range(100):
@@ -152,4 +152,4 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
     optimal_cache_size = cur.fetchall()[0][0]
     log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}")
     assert ws_estimation >= 20000 and ws_estimation <= 30000
-    assert optimal_cache_size >= 2000 and optimal_cache_size <= 7000
+    assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000
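
For scale: the two tables occupy 21277 + 2128 = 23405 pages in total, which lands inside the 20000..30000 window asserted for ws_estimation, and the tightened 3000-page upper bound keeps the 99%-hit-rate estimate near t_small's 2128 pages instead of the loose 7000 allowed before.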