From 65c519f36c8a16e3009623593332c82813ab667d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 23 Sep 2024 09:45:14 +0300 Subject: [PATCH] Use exponential histogram to improve prediction of LFC size --- pgxn/neon/Makefile | 2 +- pgxn/neon/file_cache.c | 22 +++++- pgxn/neon/hll.c | 76 ++++++++++++++++--- pgxn/neon/hll.h | 23 ++++-- pgxn/neon/neon--1.4--1.5.sql | 10 +++ pgxn/neon/neon--1.5--1.4.sql | 1 + .../test_lfc_working_set_approximation.py | 43 +++++++++++ 7 files changed, 158 insertions(+), 19 deletions(-) create mode 100644 pgxn/neon/neon--1.4--1.5.sql create mode 100644 pgxn/neon/neon--1.5--1.4.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 3b755bb042..36dce91bc6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -23,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ab6739465b..5516870b4e 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1263,7 +1263,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS) int32 dc; time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); LWLockAcquire(lfc_lock, LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, 1.0); LWLockRelease(lfc_lock); PG_RETURN_INT32(dc); } @@ -1280,7 +1280,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS) int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1, 1.0); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); @@ -1288,3 +1288,21 @@ approximate_working_set_size(PG_FUNCTION_ARGS) } PG_RETURN_NULL(); } + +PG_FUNCTION_INFO_V1(approximate_optimal_cache_size); + +Datum +approximate_optimal_cache_size(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + double min_hit_ratio = PG_ARGISNULL(1) ? 1.0 : PG_GETARG_FLOAT8(1); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, min_hit_ratio); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index f8496b3125..d57d271506 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group * * Implements https://hal.science/hal-00465313/document - * + * * Based on Hideaki Ohno's C++ implementation. This is probably not ideally * suited to estimating the cardinality of very large sets; in particular, we * have not attempted to further optimize the implementation as described in @@ -126,22 +126,78 @@ addSHLL(HyperLogLogState *cState, uint32 hash) /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - cState->regs[index][count] = now; + if (cState->regs[index][count].ts) + { + /* update histgoram */ + int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC; + uint32_t new_histogram[HIST_SIZE] = {0}; + for (int i = 0; i < HIST_SIZE; i++) { + /* Use middle point of interval */ + uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<regs[index][count].histogram[i]; + } + memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram); + } + cState->regs[index][count].ts = now; + cState->regs[index][count].histogram[0] += 1; // most recent access always goes to first histogram backet +} + +static uint32_t +getAccessCount(const HyperLogLogRegister* reg, time_t duration) +{ + uint32_t count = 0; +// Simplest solution is to take in account all points fro overlapped interval +// for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) { + for (size_t i = 0; i < HIST_SIZE; i++) { + uint32_t high_boundary = HIST_MIN_INTERVAL*(1 << i); + uint32_t low_boundary = HIST_MIN_INTERVAL*((1 << i)/2); + if (high_boundary >= duration) { + // Assume uniform distribution of points within interval and use proportional number of points + Assert(duration >= low_boundary); + count += reg->histogram[i] * (duration - low_boundary) / (high_boundary - low_boundary); + break; // it's last interval within specified time range + } else { + count += reg->histogram[i]; + } + } + return count; } static uint8 -getMaximum(const TimestampTz* reg, TimestampTz since) +getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, double min_hit_ratio) { uint8 max = 0; - - for (size_t i = 0; i < HLL_C_BITS + 1; i++) + size_t i, j; + if (min_hit_ratio == 1.0) { - if (reg[i] >= since) + for (i = 0; i < HLL_C_BITS + 1; i++) { - max = i; + if (reg[i].ts >= since) + { + max = i; + } + } + } + else + { + uint32_t total_count = 0; + for (i = 0; i < HLL_C_BITS + 1; i++) + { + total_count += getAccessCount(®[i], duration); + } + if (total_count != 0) + { + for (i = 0; i < HLL_C_BITS + 1; i++) + { + // Take in account only bits with access frequncy exceeding maximal miss rate (1 - hit rate) + if (reg[i].ts >= since && 1.0 - (double)getAccessCount(®[i], duration) / total_count <= min_hit_ratio) + { + max = i; + } + } } } - return max; } @@ -150,7 +206,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) * Estimates cardinality, based on elements added so far */ double -estimateSHLL(HyperLogLogState *cState, time_t duration) +estimateSHLL(HyperLogLogState *cState, time_t duration, double min_hit_ratio) { double result; double sum = 0.0; @@ -161,7 +217,7 @@ estimateSHLL(HyperLogLogState *cState, time_t duration) for (i = 0; i < HLL_N_REGISTERS; i++) { - R[i] = getMaximum(cState->regs[i], since); + R[i] = getMaximum(cState->regs[i], since, duration, min_hit_ratio); sum += 1.0 / pow(2.0, R[i]); } diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h index 9256cb9afa..fb9ac83515 100644 --- a/pgxn/neon/hll.h +++ b/pgxn/neon/hll.h @@ -53,6 +53,14 @@ #define HLL_C_BITS (32 - HLL_BIT_WIDTH) #define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) +/* + * Number of histogram cells. We use exponential histogram with first interval + * equals to one minutes. Autoscaler request LFC statistic with intervals 1,2,...,60 minutes + * so 2^8=64 seems to be enough for our needs. + */ +#define HIST_SIZE 8 +#define HIST_MIN_INTERVAL 60 /* seconds */ + /* * HyperLogLog is an approximate technique for computing the number of distinct * entries in a set. Importantly, it does this by using a fixed amount of @@ -69,18 +77,21 @@ * modified timestamp >= the query timestamp. This value is the number of bits * for this register in the normal HLL calculation. * - * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. - * Usage could be halved if we decide to reduce the required time dimension - * precision; as 32 bits in second precision should be enough for statistics. - * However, that is not yet implemented. + * The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB. */ +typedef struct +{ + TimestampTz ts; /* last access timestamp */ + uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */ +} HyperLogLogRegister; + typedef struct HyperLogLogState { - TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; + HyperLogLogRegister regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; } HyperLogLogState; extern void initSHLL(HyperLogLogState *cState); extern void addSHLL(HyperLogLogState *cState, uint32 hash); -extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration, double min_hit_ratio); #endif diff --git a/pgxn/neon/neon--1.4--1.5.sql b/pgxn/neon/neon--1.4--1.5.sql new file mode 100644 index 0000000000..e1c67e8995 --- /dev/null +++ b/pgxn/neon/neon--1.4--1.5.sql @@ -0,0 +1,10 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit + +-- returns minimal LFC cache size (in 8kb pages) provided specified hit rate +CREATE FUNCTION approximate_optimal_cache_size(duration_sec integer default null, min_hit_ration float8 default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_optimal_cache_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_optimal_cache_size(integer,float8) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.5--1.4.sql b/pgxn/neon/neon--1.5--1.4.sql new file mode 100644 index 0000000000..1fa970d224 --- /dev/null +++ b/pgxn/neon/neon--1.5--1.4.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_optimal_cache_size(integer,float8) CASCADE; diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4a3a949d1a..738180cffc 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -114,3 +114,46 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): assert estimation_1k >= 20 and estimation_1k <= 40 assert estimation_10k >= 200 and estimation_10k <= 400 + + +def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.5'") + cur.execute( + "create table t_huge(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute( + "create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute( + "insert into t_huge(pk) values (generate_series(1,1000000))" + ) # table size is 21277 pages + cur.execute( + "insert into t_small(pk) values (generate_series(1,100000))" + ) # table size is 2128 pages + time.sleep(2) + before = time.monotonic() + for _ in range(100): + cur.execute("select sum(count) from t_small") + cur.execute("select sum(count) from t_huge") + after = time.monotonic() + cur.execute(f"select approximate_working_set_size_seconds({int(after - before + 1)})") + ws_estimation = cur.fetchall()[0][0] + log.info(f"Working set size estimaton {ws_estimation}") + cur.execute(f"select approximate_optimal_cache_size({int(after - before + 1)}, 0.99)") + optimal_cache_size = cur.fetchall()[0][0] + log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}") + assert ws_estimation >= 20000 and ws_estimation <= 30000 + assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000