mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 17:32:56 +00:00
Add comments and update test
This commit is contained in:
@@ -6,7 +6,7 @@
|
||||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* Implements https://hal.science/hal-00465313/document
|
||||
*
|
||||
*
|
||||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
|
||||
* suited to estimating the cardinality of very large sets; in particular, we
|
||||
* have not attempted to further optimize the implementation as described in
|
||||
@@ -132,7 +132,7 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
|
||||
int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC;
|
||||
uint32_t new_histogram[HIST_SIZE] = {0};
|
||||
for (int i = 0; i < HIST_SIZE; i++) {
|
||||
/* Use average point of interval */
|
||||
/* Use middle point of interval */
|
||||
uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2)) / HIST_MIN_INTERVAL);
|
||||
uint32 cell = Min(interval_log2, HIST_SIZE-1);
|
||||
new_histogram[cell] += cState->regs[index][count].histogram[i];
|
||||
@@ -140,16 +140,26 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
|
||||
memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram);
|
||||
}
|
||||
cState->regs[index][count].ts = now;
|
||||
cState->regs[index][count].histogram[0] += 1;
|
||||
cState->regs[index][count].histogram[0] += 1; // most recent access always goes to the first histogram bucket
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
getAccessCount(const HyperLogLogRegister* reg, time_t duration)
|
||||
{
|
||||
uint32_t count = 0;
|
||||
//for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2 <= duration; i++) {
|
||||
for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
|
||||
count += reg->histogram[i];
|
||||
// Simplest solution is to take into account all points from the overlapped interval
|
||||
// for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
|
||||
for (size_t i = 0; i < HIST_SIZE; i++) {
|
||||
uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i);
|
||||
uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2);
|
||||
if (high_boundary >= duration) {
|
||||
// Assume uniform distribution of points within interval and use proportional number of points
|
||||
Assert(duration >= low_boundary);
|
||||
count += reg->histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary);
|
||||
break; // it's last interval within specified time range
|
||||
} else {
|
||||
count += reg->histogram[i];
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
@@ -159,22 +169,35 @@ getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, d
|
||||
{
|
||||
uint8 max = 0;
|
||||
size_t i, j;
|
||||
uint32_t total_count = 0;
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
total_count += getAccessCount(®[i], duration);
|
||||
}
|
||||
if (total_count != 0)
|
||||
if (min_hit_ratio == 1.0)
|
||||
{
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
if (reg[i].ts >= since && 1.0 - (double)getAccessCount(®[i], duration) / total_count <= min_hit_ratio)
|
||||
if (reg[i].ts >= since)
|
||||
{
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
uint32_t total_count = 0;
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
total_count += getAccessCount(®[i], duration);
|
||||
}
|
||||
if (total_count != 0)
|
||||
{
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
// Take into account only bits with access frequency exceeding maximal miss rate (1 - hit rate)
|
||||
if (reg[i].ts >= since && 1.0 - (double)getAccessCount(®[i], duration) / total_count <= min_hit_ratio)
|
||||
{
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
|
||||
@@ -55,8 +55,8 @@
|
||||
|
||||
/*
|
||||
* Number of histogram cells. We use exponential histogram with first interval
|
||||
* equals to one minutes. Autoscaler request LFC statistic with intervals 1,2,...,60 seconds,
|
||||
* so 1^8=64 seems to be enough for our needs.
|
||||
* equal to one minute. Autoscaler requests LFC statistics with intervals 1,2,...,60 minutes,
|
||||
* so 8 cells (upper boundaries 1,2,4,...,2^7=128 minutes) seem to be enough for our needs.
|
||||
*/
|
||||
#define HIST_SIZE 8
|
||||
#define HIST_MIN_INTERVAL 60 /* seconds */
|
||||
@@ -77,15 +77,12 @@
|
||||
* modified timestamp >= the query timestamp. This value is the number of bits
|
||||
* for this register in the normal HLL calculation.
|
||||
*
|
||||
* The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
|
||||
* Usage could be halved if we decide to reduce the required time dimension
|
||||
* precision; as 32 bits in second precision should be enough for statistics.
|
||||
* However, that is not yet implemented.
|
||||
* The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
TimestampTz ts; /* last access timestamp */
|
||||
uint32_t histogram[HIST_SIZE]; /* access counter histogram */
|
||||
uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */
|
||||
} HyperLogLogRegister;
|
||||
|
||||
typedef struct HyperLogLogState
|
||||
|
||||
@@ -137,8 +137,8 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
|
||||
cur.execute(
|
||||
"create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
|
||||
)
|
||||
cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))")
|
||||
cur.execute("insert into t_small(pk) values (generate_series(1,100000))")
|
||||
cur.execute("insert into t_huge(pk) values (generate_series(1,1000000))") # table size is 21277 pages
|
||||
cur.execute("insert into t_small(pk) values (generate_series(1,100000))") # table size is 2128 pages
|
||||
time.sleep(2)
|
||||
before = time.monotonic()
|
||||
for _ in range(100):
|
||||
@@ -152,4 +152,4 @@ def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
|
||||
optimal_cache_size = cur.fetchall()[0][0]
|
||||
log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}")
|
||||
assert ws_estimation >= 20000 and ws_estimation <= 30000
|
||||
assert optimal_cache_size >= 2000 and optimal_cache_size <= 7000
|
||||
assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000
|
||||
|
||||
Reference in New Issue
Block a user