mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
implement rolling hyper-log-log algorithm (#8068)
## Problem See #7466 ## Summary of changes Implement algorithm descried in https://hal.science/hal-00465313/document Now new GUC is added: `neon.wss_max_duration` which specifies size of sliding window (in seconds). Default value is 1 hour. It is possible to request estimation of working set sizes (within this window using new function `approximate_working_set_size_seconds`. Old function `approximate_working_set_size` is preserved for backward compatibility. But its scope is also limited by `neon.wss_max_duration`. Version of Neon extension is changed to 1.4 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Matthias van de Meent <matthias@neon.tech>
This commit is contained in:
committed by
GitHub
parent
adde0ecfe0
commit
88b13d4552
@@ -6,6 +6,7 @@ OBJS = \
|
||||
$(WIN32RES) \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
hll.o \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_utils.o \
|
||||
@@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "miscadmin.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "lib/hyperloglog.h"
|
||||
#include "pgstat.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include RELFILEINFO_HDR
|
||||
@@ -40,6 +39,8 @@
|
||||
#include "utils/dynahash.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "hll.h"
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporary store relations pages in local file system.
|
||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||
@@ -62,7 +63,6 @@
|
||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define HYPER_LOG_LOG_BIT_WIDTH 10
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
@@ -87,8 +87,7 @@ typedef struct FileCacheControl
|
||||
uint64 writes;
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
hyperLogLogState wss_estimation; /* estimation of wroking set size */
|
||||
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
|
||||
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||
} FileCacheControl;
|
||||
|
||||
static HTAB *lfc_hash;
|
||||
@@ -238,12 +237,7 @@ lfc_shmem_startup(void)
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
|
||||
/* Initialize hyper-log-log structure for estimating working set size */
|
||||
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
|
||||
|
||||
/* We need hashes in shared memory */
|
||||
pfree(lfc_ctl->wss_estimation.hashesArr);
|
||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
||||
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
|
||||
initSHLL(&lfc_ctl->wss_estimation);
|
||||
|
||||
/* Recreate file cache on restart */
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
@@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
/* Approximate working set */
|
||||
tag.blockNum = blkno;
|
||||
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||
|
||||
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
||||
{
|
||||
@@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
|
||||
|
||||
Datum
|
||||
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
|
||||
{
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
int32 dc;
|
||||
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
|
||||
LWLockRelease(lfc_lock);
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
||||
|
||||
Datum
|
||||
approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 dc = -1;
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
int32 dc;
|
||||
bool reset = PG_GETARG_BOOL(0);
|
||||
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
||||
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
|
||||
if (reset)
|
||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
||||
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
|
||||
LWLockRelease(lfc_lock);
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
PG_RETURN_INT32(dc);
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
193
pgxn/neon/hll.c
Normal file
193
pgxn/neon/hll.c
Normal file
@@ -0,0 +1,193 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* hll.c
|
||||
* Sliding HyperLogLog cardinality estimator
|
||||
*
|
||||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* Implements https://hal.science/hal-00465313/document
|
||||
*
|
||||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
|
||||
* suited to estimating the cardinality of very large sets; in particular, we
|
||||
* have not attempted to further optimize the implementation as described in
|
||||
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
|
||||
* Engineering of a State of The Art Cardinality Estimation Algorithm".
|
||||
*
|
||||
* A sparse representation of HyperLogLog state is used, with fixed space
|
||||
* overhead.
|
||||
*
|
||||
* The copyright terms of Ohno's original version (the MIT license) follow.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/lib/hyperloglog.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the 'Software'), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "postgres.h"
|
||||
#include "funcapi.h"
|
||||
#include "port/pg_bitutils.h"
|
||||
#include "utils/timestamp.h"
|
||||
#include "hll.h"
|
||||
|
||||
|
||||
#define POW_2_32 (4294967296.0)
|
||||
#define NEG_POW_2_32 (-4294967296.0)
|
||||
|
||||
#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
|
||||
|
||||
/*
|
||||
* Worker for addHyperLogLog().
|
||||
*
|
||||
* Calculates the position of the first set bit in first b bits of x argument
|
||||
* starting from the first, reading from most significant to least significant
|
||||
* bits.
|
||||
*
|
||||
* Example (when considering fist 10 bits of x):
|
||||
*
|
||||
* rho(x = 0b1000000000) returns 1
|
||||
* rho(x = 0b0010000000) returns 3
|
||||
* rho(x = 0b0000000000) returns b + 1
|
||||
*
|
||||
* "The binary address determined by the first b bits of x"
|
||||
*
|
||||
* Return value "j" used to index bit pattern to watch.
|
||||
*/
|
||||
static inline uint8
|
||||
rho(uint32 x, uint8 b)
|
||||
{
|
||||
uint8 j = 1;
|
||||
|
||||
if (x == 0)
|
||||
return b + 1;
|
||||
|
||||
j = 32 - pg_leftmost_one_pos32(x);
|
||||
|
||||
if (j > b)
|
||||
return b + 1;
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize HyperLogLog track state
|
||||
*/
|
||||
void
|
||||
initSHLL(HyperLogLogState *cState)
|
||||
{
|
||||
memset(cState->regs, 0, sizeof(cState->regs));
|
||||
}
|
||||
|
||||
/*
|
||||
* Adds element to the estimator, from caller-supplied hash.
|
||||
*
|
||||
* It is critical that the hash value passed be an actual hash value, typically
|
||||
* generated using hash_any(). The algorithm relies on a specific bit-pattern
|
||||
* observable in conjunction with stochastic averaging. There must be a
|
||||
* uniform distribution of bits in hash values for each distinct original value
|
||||
* observed.
|
||||
*/
|
||||
void
|
||||
addSHLL(HyperLogLogState *cState, uint32 hash)
|
||||
{
|
||||
uint8 count;
|
||||
uint32 index;
|
||||
size_t i;
|
||||
size_t j;
|
||||
|
||||
TimestampTz now = GetCurrentTimestamp();
|
||||
/* Use the first "k" (registerWidth) bits as a zero based index */
|
||||
index = hash >> HLL_C_BITS;
|
||||
|
||||
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
|
||||
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
|
||||
|
||||
cState->regs[index][count] = now;
|
||||
}
|
||||
|
||||
static uint8
|
||||
getMaximum(const TimestampTz* reg, TimestampTz since)
|
||||
{
|
||||
uint8 max = 0;
|
||||
|
||||
for (size_t i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
if (reg[i] >= since)
|
||||
{
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
|
||||
return max;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Estimates cardinality, based on elements added so far
|
||||
*/
|
||||
double
|
||||
estimateSHLL(HyperLogLogState *cState, time_t duration)
|
||||
{
|
||||
double result;
|
||||
double sum = 0.0;
|
||||
size_t i;
|
||||
uint8 R[HLL_N_REGISTERS];
|
||||
/* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */
|
||||
TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;
|
||||
|
||||
for (i = 0; i < HLL_N_REGISTERS; i++)
|
||||
{
|
||||
R[i] = getMaximum(cState->regs[i], since);
|
||||
sum += 1.0 / pow(2.0, R[i]);
|
||||
}
|
||||
|
||||
/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
|
||||
result = ALPHA_MM / sum;
|
||||
|
||||
if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
|
||||
{
|
||||
/* Small range correction */
|
||||
int zero_count = 0;
|
||||
|
||||
for (i = 0; i < HLL_N_REGISTERS; i++)
|
||||
{
|
||||
zero_count += R[i] == 0;
|
||||
}
|
||||
|
||||
if (zero_count != 0)
|
||||
result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
|
||||
zero_count);
|
||||
}
|
||||
else if (result > (1.0 / 30.0) * POW_2_32)
|
||||
{
|
||||
/* Large range correction */
|
||||
result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
86
pgxn/neon/hll.h
Normal file
86
pgxn/neon/hll.h
Normal file
@@ -0,0 +1,86 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* hll.h
|
||||
* Sliding HyperLogLog cardinality estimator
|
||||
*
|
||||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* Implements https://hal.science/hal-00465313/document
|
||||
*
|
||||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
|
||||
* suited to estimating the cardinality of very large sets; in particular, we
|
||||
* have not attempted to further optimize the implementation as described in
|
||||
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
|
||||
* Engineering of a State of The Art Cardinality Estimation Algorithm".
|
||||
*
|
||||
* A sparse representation of HyperLogLog state is used, with fixed space
|
||||
* overhead.
|
||||
*
|
||||
* The copyright terms of Ohno's original version (the MIT license) follow.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/lib/hyperloglog.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the 'Software'), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef HLL_H
|
||||
#define HLL_H
|
||||
|
||||
#define HLL_BIT_WIDTH 10
|
||||
#define HLL_C_BITS (32 - HLL_BIT_WIDTH)
|
||||
#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
|
||||
|
||||
/*
|
||||
* HyperLogLog is an approximate technique for computing the number of distinct
|
||||
* entries in a set. Importantly, it does this by using a fixed amount of
|
||||
* memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
|
||||
* cardinality estimation algorithm" for more.
|
||||
*
|
||||
* Instead of a single counter for every bits register, we have a timestamp
|
||||
* for every valid number of bits we can encounter. Every time we encounter
|
||||
* a certain number of bits, we update the timestamp in those registers to
|
||||
* the current timestamp.
|
||||
*
|
||||
* We can query the sketch's stored cardinality for the range of some timestamp
|
||||
* up to now: For each register, we return the highest bits bucket that has a
|
||||
* modified timestamp >= the query timestamp. This value is the number of bits
|
||||
* for this register in the normal HLL calculation.
|
||||
*
|
||||
* The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
|
||||
* Usage could be halved if we decide to reduce the required time dimension
|
||||
* precision; as 32 bits in second precision should be enough for statistics.
|
||||
* However, that is not yet implemented.
|
||||
*/
|
||||
typedef struct HyperLogLogState
|
||||
{
|
||||
TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
|
||||
} HyperLogLogState;
|
||||
|
||||
extern void initSHLL(HyperLogLogState *cState);
|
||||
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
|
||||
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration);
|
||||
|
||||
#endif
|
||||
9
pgxn/neon/neon--1.3--1.4.sql
Normal file
9
pgxn/neon/neon--1.3--1.4.sql
Normal file
@@ -0,0 +1,9 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null)
|
||||
RETURNS integer
|
||||
AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor;
|
||||
|
||||
1
pgxn/neon/neon--1.4--1.3.sql
Normal file
1
pgxn/neon/neon--1.4--1.3.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE;
|
||||
Reference in New Issue
Block a user