mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
Before this PR, some override callbacks used `.default()`, others used `.setdefault()`. As of this PR, all callbacks use `.setdefault()` which I think is least prone to failure. Aligning on a single way will set the right example for future tests that need such customization. The `test_pageserver_getpage_throttle.py` technically is a change in behavior: before, it replaced the `tenant_config` field, now it just configures the throttle. This is what I believe is intended anyway.
194 lines
7.0 KiB
Python
194 lines
7.0 KiB
Python
from __future__ import annotations
|
|
|
|
import copy
|
|
import json
|
|
import uuid
|
|
|
|
import pytest
|
|
from anyio import Path
|
|
from fixtures.common_types import TenantId, TimelineId
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
|
from fixtures.pg_version import PgVersion
|
|
from fixtures.utils import wait_until
|
|
|
|
|
|
def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
|
env = neon_env_builder.init_start()
|
|
|
|
env.pageserver.tenant_detach(env.initial_tenant)
|
|
|
|
env.pageserver.allowed_errors.append(
|
|
# https://github.com/neondatabase/neon/issues/6925
|
|
r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
|
|
)
|
|
|
|
tenant_id = TenantId.generate()
|
|
timeline_id = TimelineId.generate()
|
|
|
|
rate_limit_rps = 100
|
|
compaction_period = 5
|
|
env.pageserver.tenant_create(
|
|
tenant_id,
|
|
conf={
|
|
"compaction_period": f"{compaction_period}s",
|
|
"timeline_get_throttle": {
|
|
"task_kinds": [
|
|
"PageRequestHandler"
|
|
], # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962
|
|
"initial": 0,
|
|
"refill_interval": "100ms",
|
|
"refill_amount": int(rate_limit_rps / 10),
|
|
"max": int(rate_limit_rps / 10),
|
|
"fair": True,
|
|
},
|
|
},
|
|
)
|
|
|
|
ps_http = env.pageserver.http_client()
|
|
|
|
ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id)
|
|
|
|
def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int):
|
|
cmd = [
|
|
str(env.neon_binpath / "pagebench"),
|
|
"get-page-latest-lsn",
|
|
"--mgmt-api-endpoint",
|
|
ps_http.base_url,
|
|
"--page-service-connstring",
|
|
env.pageserver.connstr(password=None),
|
|
"--runtime",
|
|
f"{duration_secs}s",
|
|
f"{tenant_id}/{timeline_id}",
|
|
]
|
|
|
|
basepath = pg_bin.run_capture(cmd, with_command_header=False)
|
|
results_path = Path(basepath + ".stdout")
|
|
log.info(f"Benchmark results at: {results_path}")
|
|
|
|
with open(results_path) as f:
|
|
results = json.load(f)
|
|
log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
|
|
return int(results["total"]["request_count"])
|
|
|
|
log.info("warmup / make sure metrics are present")
|
|
run_pagebench_at_max_speed_and_get_total_requests_completed(2)
|
|
smgr_metrics_query = {
|
|
"tenant_id": str(tenant_id),
|
|
"timeline_id": str(timeline_id),
|
|
"smgr_query_type": "get_page_at_lsn",
|
|
}
|
|
smgr_metric_name = "pageserver_smgr_query_seconds_sum"
|
|
throttle_metrics_query = {
|
|
"tenant_id": str(tenant_id),
|
|
}
|
|
throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total"
|
|
|
|
smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
|
assert smgr_query_seconds_pre is not None
|
|
throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
|
assert throttled_usecs_pre is not None
|
|
|
|
marker = uuid.uuid4().hex
|
|
ps_http.post_tracing_event("info", marker)
|
|
_, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
|
|
|
|
log.info("run pagebench")
|
|
duration_secs = 10
|
|
actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
|
|
|
|
log.info("validate the client is capped at the configured rps limit")
|
|
expect_ncompleted = duration_secs * rate_limit_rps
|
|
delta_abs = abs(expect_ncompleted - actual_ncompleted)
|
|
threshold = 0.05 * expect_ncompleted
|
|
assert (
|
|
threshold / rate_limit_rps < 0.1 * duration_secs
|
|
), "test self-test: unrealistic expecations regarding precision in this test"
|
|
assert (
|
|
delta_abs < 0.05 * expect_ncompleted
|
|
), "the throttling deviates more than 5percent from the expectation"
|
|
|
|
log.info("validate that we logged the throttling")
|
|
|
|
wait_until(
|
|
lambda: env.pageserver.assert_log_contains(
|
|
f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
|
|
offset=marker_offset,
|
|
),
|
|
timeout=compaction_period,
|
|
)
|
|
|
|
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
|
assert smgr_query_seconds_post is not None
|
|
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
|
assert throttled_usecs_post is not None
|
|
actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
|
|
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
|
|
actual_throttled_secs = actual_throttled_usecs / 1_000_000
|
|
|
|
log.info("validate that the metric doesn't include throttle wait time")
|
|
assert (
|
|
duration_secs >= 10 * actual_smgr_query_seconds
|
|
), "smgr metrics should not include throttle wait time"
|
|
|
|
log.info("validate that the throttling wait time metrics is correct")
|
|
assert (
|
|
pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs
|
|
), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
|
|
|
|
|
|
throttle_config_with_field_fair_set = {
|
|
"task_kinds": ["PageRequestHandler"],
|
|
"fair": True,
|
|
"initial": 27,
|
|
"refill_interval": "43s",
|
|
"refill_amount": 23,
|
|
"max": 42,
|
|
}
|
|
|
|
|
|
def assert_throttle_config_with_field_fair_set(conf):
|
|
"""
|
|
Field `fair` is ignored, so, responses don't contain it
|
|
"""
|
|
without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
|
|
without_fair.pop("fair")
|
|
|
|
assert conf == without_fair
|
|
|
|
|
|
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
|
|
"""
|
|
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
|
"""
|
|
env = neon_env_builder.init_start()
|
|
vps_http = env.storage_controller.pageserver_api()
|
|
# with_fair config should still be settable
|
|
vps_http.set_tenant_config(
|
|
env.initial_tenant,
|
|
{"timeline_get_throttle": throttle_config_with_field_fair_set},
|
|
)
|
|
conf = vps_http.tenant_config(env.initial_tenant)
|
|
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|
|
assert_throttle_config_with_field_fair_set(
|
|
conf.tenant_specific_overrides["timeline_get_throttle"]
|
|
)
|
|
|
|
|
|
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
|
|
neon_env_builder: NeonEnvBuilder,
|
|
):
|
|
"""
|
|
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
|
"""
|
|
|
|
def set_tenant_config(ps_cfg):
|
|
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
|
tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
|
|
|
|
neon_env_builder.pageserver_config_override = set_tenant_config
|
|
env = neon_env_builder.init_start()
|
|
ps_http = env.pageserver.http_client()
|
|
conf = ps_http.tenant_config(env.initial_tenant)
|
|
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|