Files
neon/test_runner/regress/test_pageserver_getpage_throttle.py
Conrad Ludgate a644f01b6a proxy+pageserver: shared leaky bucket impl (#8539)
In proxy I switched to a leaky-bucket impl using the GCRA algorithm. I
figured I could share the code with pageserver and remove the
leaky_bucket crate dependency with some very basic tokio timers and
queues for fairness.

How the underlying algorithm works should be fairly clear from the
comments I have left in the code.

---

In benchmarking pageserver, @problame found that the new implementation
fixes a getpage throughput discontinuity in pageserver under the
`pagebench get-page-latest-lsn` benchmark with the clickbench dataset
(`test_perf_olap.py`).
The discontinuity is that for any of `--num-clients={2,3,4}`, getpage
throughput remains 10k.
With `--num-clients=5` and greater, getpage throughput then jumps to the
configured 20k rate limit.
With the changes in this PR, the discontinuity is gone, and we scale
throughput linearly to `--num-clients` until the configured rate limit.

More context in
https://github.com/neondatabase/cloud/issues/16886#issuecomment-2315257641.

closes https://github.com/neondatabase/cloud/issues/16886

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-08-29 11:26:52 +00:00

175 lines
6.0 KiB
Python

import copy
import json
import uuid
from anyio import Path
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until
def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Check that a tenant's `timeline_get_throttle` setting caps getpage throughput.

    The test creates a tenant throttled to `rate_limit_rps` requests per second, drives it with
    `pagebench get-page-latest-lsn`, and then validates three things:

    1. the number of completed requests is within 5% of `duration_secs * rate_limit_rps`;
    2. the pageserver log contains a "shard was throttled" line for the tenant, written after
       the marker event posted just before the measured run;
    3. the growth of `pageserver_smgr_query_seconds_sum` for `get_page_at_lsn` during the run is
       at most a tenth of the run's duration, i.e. the metric does not include throttle waits.
    """
    env = neon_env_builder.init_start()

    env.pageserver.tenant_detach(env.initial_tenant)

    env.pageserver.allowed_errors.append(
        # https://github.com/neondatabase/neon/issues/6925
        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
    )

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()

    rate_limit_rps = 100
    compaction_period = 5
    # The bucket is refilled every 100ms, i.e. 10 times per second, so each refill grants a tenth
    # of the per-second rate. The same amount is used as the bucket capacity (`max`).
    tokens_per_refill = rate_limit_rps // 10
    env.pageserver.tenant_create(
        tenant_id,
        conf={
            "compaction_period": f"{compaction_period}s",
            "timeline_get_throttle": {
                "task_kinds": ["PageRequestHandler"],
                "initial": 0,
                "refill_interval": "100ms",
                "refill_amount": tokens_per_refill,
                "max": tokens_per_refill,
                "fair": True,
            },
        },
    )

    ps_http = env.pageserver.http_client()

    ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id)

    def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int):
        """
        Run `pagebench get-page-latest-lsn` against the test timeline for `duration_secs`
        seconds and return `results["total"]["request_count"]` from its JSON output as an int.
        """
        cmd = [
            str(env.neon_binpath / "pagebench"),
            "get-page-latest-lsn",
            "--mgmt-api-endpoint",
            ps_http.base_url,
            "--page-service-connstring",
            env.pageserver.connstr(password=None),
            "--runtime",
            f"{duration_secs}s",
            f"{tenant_id}/{timeline_id}",
        ]

        # The captured stdout of the command is read back from `<basepath>.stdout`.
        basepath = pg_bin.run_capture(cmd, with_command_header=False)
        # NOTE(review): `Path` is imported from `anyio` at the top of this file, not from
        # `pathlib`. It is only used here for logging and as the argument to `open()` --
        # consider switching the import to `pathlib.Path`.
        results_path = Path(basepath + ".stdout")
        log.info(f"Benchmark results at: {results_path}")

        with open(results_path, "r", encoding="utf-8") as f:
            results = json.load(f)
        log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
        return int(results["total"]["request_count"])

    log.info("warmup / make sure metrics are present")
    run_pagebench_at_max_speed_and_get_total_requests_completed(2)
    metrics_query = {
        "tenant_id": str(tenant_id),
        "timeline_id": str(timeline_id),
        "smgr_query_type": "get_page_at_lsn",
    }
    metric_name = "pageserver_smgr_query_seconds_sum"
    # Baseline taken after the warmup so that only the measured run contributes to the delta.
    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
    assert smgr_query_seconds_pre is not None

    # Emit a unique marker into the pageserver log and remember where it landed, so the
    # throttling log line is only searched for after this point.
    marker = uuid.uuid4().hex
    ps_http.post_tracing_event("info", marker)
    _, marker_offset = wait_until(
        10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
    )

    log.info("run pagebench")
    duration_secs = 10
    actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)

    log.info("validate the client is capped at the configured rps limit")
    expect_ncompleted = duration_secs * rate_limit_rps
    delta_abs = abs(expect_ncompleted - actual_ncompleted)
    threshold = 0.05 * expect_ncompleted
    # Self-test: the tolerance, expressed as seconds' worth of requests at the configured rate,
    # must stay below a tenth of the run duration.
    assert (
        threshold / rate_limit_rps < 0.1 * duration_secs
    ), "test self-test: unrealistic expecations regarding precision in this test"
    # Compare against the same `threshold` that the self-test above validated, so the two
    # checks cannot drift apart.
    assert (
        delta_abs < threshold
    ), "the throttling deviates more than 5percent from the expectation"

    log.info("validate that we logged the throttling")
    # NOTE(review): the poll interval is derived from `compaction_period`; presumably the
    # throttling log line is emitted on that cadence -- confirm against the pageserver.
    wait_until(
        10,
        compaction_period / 10,
        lambda: env.pageserver.assert_log_contains(
            f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
            offset=marker_offset,
        ),
    )

    log.info("validate that the metric doesn't include throttle wait time")
    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
    assert smgr_query_seconds_post is not None
    actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre

    # The summed query time accrued during the run must be at most 10% of the run's duration.
    assert (
        duration_secs >= 10 * actual_smgr_query_seconds
    ), "smgr metrics should not include throttle wait time"
# Throttle settings that include the `fair` field.
# Used as the `timeline_get_throttle` value by the two
# `test_throttle_fair_config_is_settable_but_ignored_*` tests in this file, and, with `fair`
# removed, as the expected value in `assert_throttle_config_with_field_fair_set`.
# NOTE(review): the numeric values look arbitrary (chosen to be recognizable when read back);
# confirm before relying on any of them.
throttle_config_with_field_fair_set = {
    "task_kinds": ["PageRequestHandler"],
    "fair": True,
    "initial": 27,
    "refill_interval": "43s",
    "refill_amount": 23,
    "max": 42,
}
def assert_throttle_config_with_field_fair_set(conf):
    """
    Assert that `conf` equals `throttle_config_with_field_fair_set` minus its `fair` key.

    Field `fair` is ignored, so responses don't contain it.
    """
    # Work on a private copy so the shared module-level config keeps its `fair` entry.
    expected = copy.deepcopy(throttle_config_with_field_fair_set)
    del expected["fair"]
    assert conf == expected
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
    """
    Set a throttle config containing `fair` through the pageserver HTTP client and check that
    both the effective config and the tenant-specific overrides read back equal the submitted
    throttle config without the `fair` field.

    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
    """
    env = neon_env_builder.init_start()
    pageserver_api = env.pageserver.http_client()
    tenant = env.initial_tenant

    # with_fair config should still be settable
    requested = {"timeline_get_throttle": throttle_config_with_field_fair_set}
    pageserver_api.set_tenant_config(tenant, requested)

    reported = pageserver_api.tenant_config(tenant)

    effective_throttle = reported.effective_config["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(effective_throttle)

    override_throttle = reported.tenant_specific_overrides["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(override_throttle)
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Supply a throttle config containing `fair` via the builder's pageserver config override and
    check that the initial tenant's effective throttle config equals it without `fair`.

    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
    """

    def inject_throttle_config(pageserver_cfg):
        # Installed as `pageserver_config_override` below; presumably invoked by the builder
        # with the pageserver config mapping before the pageserver starts -- confirm.
        pageserver_cfg["tenant_config"] = {
            "timeline_get_throttle": throttle_config_with_field_fair_set
        }

    neon_env_builder.pageserver_config_override = inject_throttle_config
    env = neon_env_builder.init_start()

    reported = env.pageserver.http_client().tenant_config(env.initial_tenant)
    effective_throttle = reported.effective_config["timeline_get_throttle"]
    assert_throttle_config_with_field_fair_set(effective_throttle)