implement rolling hyper-log-log algorithm (#8068)

## Problem

See #7466

## Summary of changes

Implement the algorithm described in
https://hal.science/hal-00465313/document

A new GUC is added:
`neon.wss_max_duration`, which specifies the size of the sliding window (in
seconds). The default value is 1 hour.

It is possible to request an estimate of the working set size within this
window using the new function
`approximate_working_set_size_seconds`. The old function
`approximate_working_set_size` is preserved for backward compatibility,
but its scope is also limited by `neon.wss_max_duration`.

Version of Neon extension is changed to 1.4

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Matthias van de Meent <matthias@neon.tech>
This commit is contained in:
Konstantin Knizhnik
2024-07-04 22:03:58 +03:00
committed by Vlad Lazar
parent 8e5a6808f5
commit e7da970424
8 changed files with 363 additions and 17 deletions

View File

@@ -1,3 +1,4 @@
import time
from pathlib import Path
from fixtures.log_helper import log
@@ -72,3 +73,46 @@ WITH (fillfactor='100');
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
    """
    Exercise approximate_working_set_size_seconds() with two window lengths.

    A table of one million rows is populated, then two primary-key range scans
    are run about two seconds apart: first pk 10000..20000, then pk 1000..2000.
    The estimate is requested once for a window reaching back to the start of
    the later scan and once for a window reaching back to the start of the
    earlier scan, and each result is checked against a fixed range of blocks.
    """
    endpoint_config = [
        "autovacuum = off",
        "shared_buffers=1MB",
        "neon.max_file_cache_size=256MB",
        "neon.file_cache_size_limit=245MB",
    ]
    endpoint = neon_simple_env.endpoints.create_start(
        branch_name="main",
        config_lines=endpoint_config,
    )
    connection = endpoint.connect()
    cursor = connection.cursor()

    cursor.execute("create extension neon version '1.4'")
    cursor.execute(
        "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
    )
    cursor.execute("insert into t (pk) values (generate_series(1,1000000))")

    # Each phase is preceded by a pause so that it is separated in time from
    # the activity before it and can be isolated by the window length.
    time.sleep(2)
    started_10k = time.monotonic()
    cursor.execute("select sum(count) from t where pk between 10000 and 20000")

    time.sleep(2)
    started_1k = time.monotonic()
    cursor.execute("select sum(count) from t where pk between 1000 and 2000")
    finished = time.monotonic()

    # Window lengths in whole seconds; adding 1 before truncating keeps the
    # window at least as long as the time elapsed since the scan started.
    window_1k = int(finished - started_1k + 1)
    window_10k = int(finished - started_10k + 1)

    cursor.execute(f"select approximate_working_set_size_seconds({window_1k})")
    estimation_1k = cursor.fetchall()[0][0]
    log.info(f"Working set size for selecting 1k records {estimation_1k}")

    cursor.execute(f"select approximate_working_set_size_seconds({window_10k})")
    estimation_10k = cursor.fetchall()[0][0]
    log.info(f"Working set size for selecting 10k records {estimation_10k}")

    # The table size is only logged for reference; it is not asserted on.
    cursor.execute("select pg_table_size('t')")
    table_blocks = cursor.fetchall()[0][0] // 8192
    log.info(f"Table size {table_blocks} blocks")

    assert 20 <= estimation_1k <= 40
    assert 200 <= estimation_10k <= 400

View File

@@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.3",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
all_versions = ["1.3", "1.2", "1.1", "1.0"]
all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"]
current_version = "1.3"
for idx, begin_version in enumerate(all_versions):
for target_version in all_versions[idx + 1 :]: