page_service: batching observability & include throttled time in smgr metrics (#9870)

This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an additional startup log line logging the current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was **unable to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR remvoes the deduction behavior from all metrics. I started a discussion on Slack about it the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376
2026-07-07 06:00:38 +00:00 · 2024-12-03 12:03:23 +01:00
parent 8963ac85f9
commit dd76f1eeee
11 changed files with 373 additions and 396 deletions
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),
    counter("pageserver_tenant_throttling_count"),
+    *histogram("pageserver_page_service_batch_size"),
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_directory_entries_count", -- only used if above a certain threshold
    # "pageserver_broken_tenants_count" -- used only for broken
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -167,18 +167,18 @@ def test_throughput(
    @dataclass
    class Metrics:
        time: float
-        pageserver_getpage_count: float
-        pageserver_vectored_get_count: float
+        pageserver_batch_size_histo_sum: float
+        pageserver_batch_size_histo_count: float
        compute_getpage_count: float
        pageserver_cpu_seconds_total: float

        def __sub__(self, other: "Metrics") -> "Metrics":
            return Metrics(
                time=self.time - other.time,
-                pageserver_getpage_count=self.pageserver_getpage_count
-                - other.pageserver_getpage_count,
-                pageserver_vectored_get_count=self.pageserver_vectored_get_count
-                - other.pageserver_vectored_get_count,
+                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
+                - other.pageserver_batch_size_histo_sum,
+                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
+                - other.pageserver_batch_size_histo_count,
                compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                - other.pageserver_cpu_seconds_total,
@@ -187,8 +187,8 @@ def test_throughput(
        def normalize(self, by) -> "Metrics":
            return Metrics(
                time=self.time / by,
-                pageserver_getpage_count=self.pageserver_getpage_count / by,
-                pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
+                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
+                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                compute_getpage_count=self.compute_getpage_count / by,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
            )
@@ -202,11 +202,11 @@ def test_throughput(
            pageserver_metrics = ps_http.get_metrics()
            return Metrics(
                time=time.time(),
-                pageserver_getpage_count=pageserver_metrics.query_one(
-                    "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
+                pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
+                    "pageserver_page_service_batch_size_sum"
                ).value,
-                pageserver_vectored_get_count=pageserver_metrics.query_one(
-                    "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
+                pageserver_batch_size_histo_count=pageserver_metrics.query_one(
+                    "pageserver_page_service_batch_size_count"
                ).value,
                compute_getpage_count=compute_getpage_count,
                pageserver_cpu_seconds_total=pageserver_metrics.query_one(
@@ -243,7 +243,7 @@ def test_throughput(
    # Sanity-checks on the collected data
    #
    # assert that getpage counts roughly match between compute and ps
-    assert metrics.pageserver_getpage_count == pytest.approx(
+    assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
        metrics.compute_getpage_count, rel=0.01
    )

@@ -256,7 +256,7 @@ def test_throughput(

    zenbenchmark.record(
        "perfmetric.batching_factor",
-        metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
+        metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
        unit="",
        report=MetricReport.HIGHER_IS_BETTER,
    )
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -4,6 +4,7 @@ import copy
 import json
 import uuid

+import pytest
 from anyio import Path
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
@@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P

    log.info("warmup / make sure metrics are present")
    run_pagebench_at_max_speed_and_get_total_requests_completed(2)
-    metrics_query = {
+    smgr_metrics_query = {
        "tenant_id": str(tenant_id),
        "timeline_id": str(timeline_id),
        "smgr_query_type": "get_page_at_lsn",
    }
-    metric_name = "pageserver_smgr_query_seconds_sum"
-    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
+    smgr_metric_name = "pageserver_smgr_query_seconds_sum"
+    throttle_metrics_query = {
+        "tenant_id": str(tenant_id),
+    }
+    throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total"
+
+    smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
    assert smgr_query_seconds_pre is not None
+    throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
+    assert throttled_usecs_pre is not None

    marker = uuid.uuid4().hex
    ps_http.post_tracing_event("info", marker)
@@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
        timeout=compaction_period,
    )

-    log.info("validate that the metric doesn't include throttle wait time")
-    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
+    log.info("the smgr metric includes throttle time")
+    smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
    assert smgr_query_seconds_post is not None
+    throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
+    assert throttled_usecs_post is not None
    actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
+    actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
+    actual_throttled_secs = actual_throttled_usecs / 1_000_000

    assert (
-        duration_secs >= 10 * actual_smgr_query_seconds
-    ), "smgr metrics should not include throttle wait time"
+        pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds
+    ), "smgr metrics include throttle wait time"
+    smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs
+    assert smgr_ex_throttle > 0
+    assert (
+        duration_secs > 10 * smgr_ex_throttle
+    ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"


 throttle_config_with_field_fair_set = {