mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 22:50:38 +00:00
page_service: batching observability & include throttled time in smgr metrics (#9870)
This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an additional startup log line logging the current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was **unable to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR remvoes the deduction behavior from all metrics. I started a discussion on Slack about it the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376
This commit is contained in:
committed by
Ivan Efremov
parent
8963ac85f9
commit
dd76f1eeee
@@ -167,18 +167,18 @@ def test_throughput(
|
||||
@dataclass
|
||||
class Metrics:
|
||||
time: float
|
||||
pageserver_getpage_count: float
|
||||
pageserver_vectored_get_count: float
|
||||
pageserver_batch_size_histo_sum: float
|
||||
pageserver_batch_size_histo_count: float
|
||||
compute_getpage_count: float
|
||||
pageserver_cpu_seconds_total: float
|
||||
|
||||
def __sub__(self, other: "Metrics") -> "Metrics":
|
||||
return Metrics(
|
||||
time=self.time - other.time,
|
||||
pageserver_getpage_count=self.pageserver_getpage_count
|
||||
- other.pageserver_getpage_count,
|
||||
pageserver_vectored_get_count=self.pageserver_vectored_get_count
|
||||
- other.pageserver_vectored_get_count,
|
||||
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
|
||||
- other.pageserver_batch_size_histo_sum,
|
||||
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
|
||||
- other.pageserver_batch_size_histo_count,
|
||||
compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
|
||||
- other.pageserver_cpu_seconds_total,
|
||||
@@ -187,8 +187,8 @@ def test_throughput(
|
||||
def normalize(self, by) -> "Metrics":
|
||||
return Metrics(
|
||||
time=self.time / by,
|
||||
pageserver_getpage_count=self.pageserver_getpage_count / by,
|
||||
pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
|
||||
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
|
||||
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
|
||||
compute_getpage_count=self.compute_getpage_count / by,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
|
||||
)
|
||||
@@ -202,11 +202,11 @@ def test_throughput(
|
||||
pageserver_metrics = ps_http.get_metrics()
|
||||
return Metrics(
|
||||
time=time.time(),
|
||||
pageserver_getpage_count=pageserver_metrics.query_one(
|
||||
"pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
|
||||
pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
|
||||
"pageserver_page_service_batch_size_sum"
|
||||
).value,
|
||||
pageserver_vectored_get_count=pageserver_metrics.query_one(
|
||||
"pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
|
||||
pageserver_batch_size_histo_count=pageserver_metrics.query_one(
|
||||
"pageserver_page_service_batch_size_count"
|
||||
).value,
|
||||
compute_getpage_count=compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=pageserver_metrics.query_one(
|
||||
@@ -243,7 +243,7 @@ def test_throughput(
|
||||
# Sanity-checks on the collected data
|
||||
#
|
||||
# assert that getpage counts roughly match between compute and ps
|
||||
assert metrics.pageserver_getpage_count == pytest.approx(
|
||||
assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
|
||||
metrics.compute_getpage_count, rel=0.01
|
||||
)
|
||||
|
||||
@@ -256,7 +256,7 @@ def test_throughput(
|
||||
|
||||
zenbenchmark.record(
|
||||
"perfmetric.batching_factor",
|
||||
metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
|
||||
metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
|
||||
unit="",
|
||||
report=MetricReport.HIGHER_IS_BETTER,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user