proxy+pageserver: shared leaky bucket impl (#8539)

In proxy I switched to a leaky-bucket impl using the GCRA algorithm. I
figured I could share the code with pageserver and remove the
leaky_bucket crate dependency with some very basic tokio timers and
queues for fairness.

How the underlying algorithm works should be fairly clear from the
comments I have left in the code.

---

In benchmarking pageserver, @problame found that the new implementation
fixes a getpage throughput discontinuity in pageserver under the
`pagebench get-page-latest-lsn` benchmark with the clickbench dataset
(`test_perf_olap.py`).
The discontinuity is that for any of `--num-clients={2,3,4}`, getpage
throughput remains 10k.
With `--num-clients=5` and greater, getpage throughput then jumps to the
configured 20k rate limit.
With the changes in this PR, the discontinuity is gone, and we scale
throughput linearly to `--num-clients` until the configured rate limit.

More context in
https://github.com/neondatabase/cloud/issues/16886#issuecomment-2315257641.

closes https://github.com/neondatabase/cloud/issues/16886

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
Conrad Ludgate
2024-08-29 12:26:52 +01:00
committed by GitHub
parent c2f8fdccd7
commit a644f01b6a
12 changed files with 395 additions and 114 deletions

View File

@@ -162,7 +162,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"min_resident_size_override": 23,
"timeline_get_throttle": {
"task_kinds": ["PageRequestHandler"],
"fair": True,
"initial": 0,
"refill_interval": "1s",
"refill_amount": 1000,

View File

@@ -1,3 +1,4 @@
import copy
import json
import uuid
@@ -116,3 +117,58 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
assert (
duration_secs >= 10 * actual_smgr_query_seconds
), "smgr metrics should not include throttle wait time"
# Throttle config that still carries the `fair` field. The tests below submit
# it as-is and compare what the pageserver reports back against it with `fair`
# removed.
throttle_config_with_field_fair_set = {
    "task_kinds": ["PageRequestHandler"],
    "fair": True,
    "initial": 27,
    "refill_interval": "43s",
    "refill_amount": 23,
    "max": 42,
}
def assert_throttle_config_with_field_fair_set(conf):
    """
    Assert that `conf` equals `throttle_config_with_field_fair_set` minus `fair`.

    The `fair` field is ignored, so responses are expected not to contain it.
    """
    # Work on a deep copy so the shared module-level config is never mutated.
    expected = copy.deepcopy(throttle_config_with_field_fair_set)
    del expected["fair"]
    assert conf == expected
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
    """
    Set a throttle config containing `fair` through the pageserver HTTP client,
    then check that the config read back matches it with `fair` removed.

    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
    """
    neon_env = neon_env_builder.init_start()
    client = neon_env.pageserver.http_client()

    # A config that still carries `fair` must be accepted.
    new_tenant_config = {"timeline_get_throttle": throttle_config_with_field_fair_set}
    client.set_tenant_config(neon_env.initial_tenant, new_tenant_config)

    tenant_conf = client.tenant_config(neon_env.initial_tenant)

    # Both views of the tenant config are checked against the same expectation.
    effective_throttle = tenant_conf.effective_config["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(effective_throttle)

    override_throttle = tenant_conf.tenant_specific_overrides["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(override_throttle)
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Supply a throttle config containing `fair` through the pageserver config
    override, then check that the effective tenant config matches it with
    `fair` removed.

    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
    """

    def set_tenant_config(pageserver_cfg):
        # Install the throttle settings under the `tenant_config` key.
        tenant_section = {"timeline_get_throttle": throttle_config_with_field_fair_set}
        pageserver_cfg["tenant_config"] = tenant_section

    # The override must be registered before the environment is started.
    neon_env_builder.pageserver_config_override = set_tenant_config
    neon_env = neon_env_builder.init_start()

    client = neon_env.pageserver.http_client()
    tenant_conf = client.tenant_config(neon_env.initial_tenant)

    effective_throttle = tenant_conf.effective_config["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(effective_throttle)