Add comments and update test

Konstantin Knizhnik
2024-09-15 08:18:46 +02:00
parent f73128fcaf
commit b71ffd7e60
3 changed files with 44 additions and 24 deletions

View File

@@ -6,7 +6,7 @@
  * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
  *
  * Implements https://hal.science/hal-00465313/document
- * 
+ *
  * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
  * suited to estimating the cardinality of very large sets; in particular, we
  * have not attempted to further optimize the implementation as described in
@@ -132,7 +132,7 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 		int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC;
 		uint32_t new_histogram[HIST_SIZE] = {0};
 		for (int i = 0; i < HIST_SIZE; i++) {
-			/* Use average point of interval */
+			/* Use middle point of interval */
 			uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2)) / HIST_MIN_INTERVAL);
 			uint32 cell = Min(interval_log2, HIST_SIZE-1);
 			new_histogram[cell] += cState->regs[index][count].histogram[i];
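
The rescaling above ages each cell by the idle time delta and re-buckets its contents using the cell's middle point: cell i spans (HIST_MIN_INTERVAL*2^(i-1), HIST_MIN_INTERVAL*2^i] seconds, and HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2 is exactly the midpoint of that span. A worked example with HIST_MIN_INTERVAL = 60: for cell i = 1 (covering 60..120 s, midpoint 90 s) and delta = 300 s, the shifted age is 300 + 90 = 390 s, and pg_ceil_log2_32(390 / 60) = pg_ceil_log2_32(6) = 3, so the counts migrate to cell 3.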
@@ -140,16 +140,26 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 		memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram);
 	}
 	cState->regs[index][count].ts = now;
-	cState->regs[index][count].histogram[0] += 1;
+	cState->regs[index][count].histogram[0] += 1; // the most recent access always goes to the first histogram bucket
 }
 
 static uint32_t
 getAccessCount(const HyperLogLogRegister* reg, time_t duration)
 {
 	uint32_t count = 0;
-	//for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2 <= duration; i++) {
-	for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
-		count += reg->histogram[i];
+	// The simplest solution is to take into account all points from the overlapped interval:
+	// for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
+	for (size_t i = 0; i < HIST_SIZE; i++) {
+		uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i);
+		uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2);
+		if (high_boundary >= duration) {
+			// Assume uniform distribution of points within the interval and take a proportional number of points
+			Assert(duration >= low_boundary);
+			count += reg->histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary);
+			break; // it's the last interval within the specified time range
+		} else {
+			count += reg->histogram[i];
+		}
 	}
 	return count;
 }
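
For illustration, a minimal standalone sketch of the interpolation above; access_count(), main() and the sample histogram values are hypothetical and only mirror the arithmetic of the new loop, assuming HIST_MIN_INTERVAL = 60 and HIST_SIZE = 8 as defined in the header:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HIST_MIN_INTERVAL 60 /* seconds, as in the header */
#define HIST_SIZE 8

/* Hypothetical standalone mirror of the new getAccessCount() loop:
 * bucket i covers (60*2^(i-1), 60*2^i] seconds, and the bucket that
 * straddles `duration` contributes a share proportional to the overlap. */
static uint32_t
access_count(const uint32_t histogram[HIST_SIZE], uint32_t duration)
{
	uint32_t count = 0;
	for (size_t i = 0; i < HIST_SIZE; i++) {
		uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i);
		uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2);
		if (high_boundary >= duration) {
			assert(duration >= low_boundary);
			count += histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary);
			break; /* last interval within the requested time range */
		}
		count += histogram[i]; /* bucket lies entirely inside the range */
	}
	return count;
}

int main(void)
{
	/* 10 accesses in every bucket; a 90-second window fully covers
	 * bucket 0 (0..60 s] and half of bucket 1 (60..120 s], so the
	 * estimate is 10 + 10*(90-60)/(120-60) = 15. */
	uint32_t histogram[HIST_SIZE] = {10, 10, 10, 10, 10, 10, 10, 10};
	printf("%u\n", access_count(histogram, 90)); /* prints 15 */
	return 0;
}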
@@ -159,22 +169,35 @@ getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, double min_hit_ratio)
 {
 	uint8 max = 0;
 	size_t i, j;
 
-	uint32_t total_count = 0;
-	for (i = 0; i < HLL_C_BITS + 1; i++)
-	{
-		total_count += getAccessCount(&reg[i], duration);
-	}
-	if (total_count != 0)
+	if (min_hit_ratio == 1.0)
 	{
 		for (i = 0; i < HLL_C_BITS + 1; i++)
 		{
-			if (reg[i].ts >= since && 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= min_hit_ratio)
+			if (reg[i].ts >= since)
 			{
 				max = i;
 			}
 		}
 	}
+	else
+	{
+		uint32_t total_count = 0;
+		for (i = 0; i < HLL_C_BITS + 1; i++)
+		{
+			total_count += getAccessCount(&reg[i], duration);
+		}
+		if (total_count != 0)
+		{
+			for (i = 0; i < HLL_C_BITS + 1; i++)
+			{
+				// Take into account only bits with access frequency exceeding the maximal miss rate (1 - hit rate)
+				if (reg[i].ts >= since && 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= min_hit_ratio)
+				{
+					max = i;
+				}
+			}
+		}
+	}
 	return max;
 }
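
Why the min_hit_ratio == 1.0 fast path is sound: getAccessCount() never exceeds total_count, so 1.0 - (double)getAccessCount(&reg[i], duration) / total_count <= 1.0 holds for every register, and with min_hit_ratio == 1.0 the frequency filter accepts everything. The dedicated branch therefore keeps only the reg[i].ts >= since check, skips HLL_C_BITS + 1 extra histogram scans, and also honors registers when total_count is 0, which the old code silently skipped.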

View File

@@ -55,8 +55,8 @@
 /*
  * Number of histogram cells. We use exponential histogram with first interval
- * equals to one minutes. Autoscaler request LFC statistic with intervals 1,2,...,60 seconds,
- * so 1^8=64 seems to be enough for our needs.
+ * equal to one minute. The autoscaler requests LFC statistics with intervals of 1,2,...,60 minutes,
+ * so 8 cells covering up to 2^7 = 128 minutes are enough for our needs.
  */
 
 #define HIST_SIZE 8
 #define HIST_MIN_INTERVAL 60 /* seconds */
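
With HIST_MIN_INTERVAL = 60 and HIST_SIZE = 8, cell i covers ages in (60*2^(i-1), 60*2^i] seconds, giving cell upper boundaries of 1, 2, 4, 8, 16, 32, 64 and 128 minutes; every 1..60-minute window the autoscaler can request therefore falls within the histogram's reach, with the last cell left as headroom.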
@@ -77,15 +77,12 @@
  * modified timestamp >= the query timestamp. This value is the number of bits
  * for this register in the normal HLL calculation.
  *
- * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
- * Usage could be halved if we decide to reduce the required time dimension
- * precision; as 32 bits in second precision should be enough for statistics.
- * However, that is not yet implemented.
+ * The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB.
  */
 typedef struct
 {
 	TimestampTz ts; /* last access timestamp */
-	uint32_t histogram[HIST_SIZE]; /* access counter histogram */
+	uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */
 } HyperLogLogRegister;
typedef struct HyperLogLogState
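
The updated figure checks out, assuming B = 10 and C = 22 (the values consistent with both the old and the new numbers): sizeof(HyperLogLogRegister) = sizeof(TimestampTz) + HIST_SIZE * sizeof(uint32_t) = 8 + 8*4 = 40 bytes, so 2^10 * (22 + 1) * 40 = 942,080 bytes = 920 KiB; the old estimate was the same 2^10 * 23 registers times sizeof(TimestampTz) = 8 bytes, i.e. 188,416 bytes = 184 KiB.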

View File

@@ -137,8 +137,8 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
     cur.execute(
         "create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
     )
-    cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))")
-    cur.execute("insert into t_small(pk) values (generate_series(1,100000))")
+    cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))")  # table size is 21277 pages
+    cur.execute("insert into t_small(pk) values (generate_series(1,100000))")  # table size is 2128 pages
     time.sleep(2)
     before = time.monotonic()
     for _ in range(100):
@@ -152,4 +152,4 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
     optimal_cache_size = cur.fetchall()[0][0]
     log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}")
     assert ws_estimation >= 20000 and ws_estimation <= 30000
-    assert optimal_cache_size >= 2000 and optimal_cache_size <= 7000
+    assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000
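
For scale: the two tables occupy 21277 + 2128 = 23405 pages in total, which lands inside the 20000..30000 window asserted for ws_estimation, and the tightened 3000-page upper bound keeps the 99%-hit-rate estimate near t_small's 2128 pages instead of the loose 7000 allowed before.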