mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 21:50:37 +00:00
page_service: batching observability & include throttled time in smgr metrics (#9870)
This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an additional startup log line logging the current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was **unable to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR remvoes the deduction behavior from all metrics. I started a discussion on Slack about it the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376
This commit is contained in:
committed by
Ivan Efremov
parent
8963ac85f9
commit
dd76f1eeee
@@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
counter("pageserver_tenant_throttling_count_accounted_finish"),
|
||||
counter("pageserver_tenant_throttling_wait_usecs_sum"),
|
||||
counter("pageserver_tenant_throttling_count"),
|
||||
*histogram("pageserver_page_service_batch_size"),
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# "pageserver_directory_entries_count", -- only used if above a certain threshold
|
||||
# "pageserver_broken_tenants_count" -- used only for broken
|
||||
|
||||
@@ -167,18 +167,18 @@ def test_throughput(
|
||||
@dataclass
|
||||
class Metrics:
|
||||
time: float
|
||||
pageserver_getpage_count: float
|
||||
pageserver_vectored_get_count: float
|
||||
pageserver_batch_size_histo_sum: float
|
||||
pageserver_batch_size_histo_count: float
|
||||
compute_getpage_count: float
|
||||
pageserver_cpu_seconds_total: float
|
||||
|
||||
def __sub__(self, other: "Metrics") -> "Metrics":
|
||||
return Metrics(
|
||||
time=self.time - other.time,
|
||||
pageserver_getpage_count=self.pageserver_getpage_count
|
||||
- other.pageserver_getpage_count,
|
||||
pageserver_vectored_get_count=self.pageserver_vectored_get_count
|
||||
- other.pageserver_vectored_get_count,
|
||||
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
|
||||
- other.pageserver_batch_size_histo_sum,
|
||||
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
|
||||
- other.pageserver_batch_size_histo_count,
|
||||
compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
|
||||
- other.pageserver_cpu_seconds_total,
|
||||
@@ -187,8 +187,8 @@ def test_throughput(
|
||||
def normalize(self, by) -> "Metrics":
|
||||
return Metrics(
|
||||
time=self.time / by,
|
||||
pageserver_getpage_count=self.pageserver_getpage_count / by,
|
||||
pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
|
||||
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
|
||||
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
|
||||
compute_getpage_count=self.compute_getpage_count / by,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
|
||||
)
|
||||
@@ -202,11 +202,11 @@ def test_throughput(
|
||||
pageserver_metrics = ps_http.get_metrics()
|
||||
return Metrics(
|
||||
time=time.time(),
|
||||
pageserver_getpage_count=pageserver_metrics.query_one(
|
||||
"pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
|
||||
pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
|
||||
"pageserver_page_service_batch_size_sum"
|
||||
).value,
|
||||
pageserver_vectored_get_count=pageserver_metrics.query_one(
|
||||
"pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
|
||||
pageserver_batch_size_histo_count=pageserver_metrics.query_one(
|
||||
"pageserver_page_service_batch_size_count"
|
||||
).value,
|
||||
compute_getpage_count=compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=pageserver_metrics.query_one(
|
||||
@@ -243,7 +243,7 @@ def test_throughput(
|
||||
# Sanity-checks on the collected data
|
||||
#
|
||||
# assert that getpage counts roughly match between compute and ps
|
||||
assert metrics.pageserver_getpage_count == pytest.approx(
|
||||
assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
|
||||
metrics.compute_getpage_count, rel=0.01
|
||||
)
|
||||
|
||||
@@ -256,7 +256,7 @@ def test_throughput(
|
||||
|
||||
zenbenchmark.record(
|
||||
"perfmetric.batching_factor",
|
||||
metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
|
||||
metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
|
||||
unit="",
|
||||
report=MetricReport.HIGHER_IS_BETTER,
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ import copy
|
||||
import json
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
from anyio import Path
|
||||
from fixtures.common_types import TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
@@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
|
||||
log.info("warmup / make sure metrics are present")
|
||||
run_pagebench_at_max_speed_and_get_total_requests_completed(2)
|
||||
metrics_query = {
|
||||
smgr_metrics_query = {
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"smgr_query_type": "get_page_at_lsn",
|
||||
}
|
||||
metric_name = "pageserver_smgr_query_seconds_sum"
|
||||
smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
|
||||
smgr_metric_name = "pageserver_smgr_query_seconds_sum"
|
||||
throttle_metrics_query = {
|
||||
"tenant_id": str(tenant_id),
|
||||
}
|
||||
throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total"
|
||||
|
||||
smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
||||
assert smgr_query_seconds_pre is not None
|
||||
throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
||||
assert throttled_usecs_pre is not None
|
||||
|
||||
marker = uuid.uuid4().hex
|
||||
ps_http.post_tracing_event("info", marker)
|
||||
@@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
timeout=compaction_period,
|
||||
)
|
||||
|
||||
log.info("validate that the metric doesn't include throttle wait time")
|
||||
smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
|
||||
log.info("the smgr metric includes throttle time")
|
||||
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
||||
assert smgr_query_seconds_post is not None
|
||||
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
||||
assert throttled_usecs_post is not None
|
||||
actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
|
||||
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
|
||||
actual_throttled_secs = actual_throttled_usecs / 1_000_000
|
||||
|
||||
assert (
|
||||
duration_secs >= 10 * actual_smgr_query_seconds
|
||||
), "smgr metrics should not include throttle wait time"
|
||||
pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds
|
||||
), "smgr metrics include throttle wait time"
|
||||
smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs
|
||||
assert smgr_ex_throttle > 0
|
||||
assert (
|
||||
duration_secs > 10 * smgr_ex_throttle
|
||||
), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
|
||||
|
||||
|
||||
throttle_config_with_field_fair_set = {
|
||||
|
||||
Reference in New Issue
Block a user