page_service: batching observability & include throttled time in smgr metrics (#9870)

This PR 

- fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 
- adds an additional startup log line logging the current batching
config
- adds a histogram of batch sizes global and per-tenant
- adds a metric exposing the current batching config

The issue described #9925 is that before this PR, request latency was
only observed *after* batching.
This means that smgr latency metrics (most importantly getpage latency)
don't account for
- `wait_lsn` time 
- time spent waiting for batch to fill up / the executor stage to pick
up the batch.

The fix is to use a per-request batching timer, like we did before the
initial batching PR.
We funnel those timers through the entire request lifecycle.

I noticed that even before the initial batching changes, we weren't
accounting for the time spent writing & flushing the response to the
wire.
This PR drive-by fixes that deficiency by dropping the timers at the
very end of processing the batch, i.e., after the `pgb.flush()` call.

I was **unable to maintain the behavior that we deduct
time-spent-in-throttle from various latency metrics.
The reason is that we're using a *single* counter in `RequestContext` to
track micros spent in throttle.
But there are *N* metrics timers in the batch, one per request.
As a consequence, the practice of consuming the counter in the drop
handler of each timer no longer works because all but the first timer
will encounter error `close() called on closed state`.
A failed attempt to maintain the current behavior can be found in
https://github.com/neondatabase/neon/pull/9951.

So, this PR remvoes the deduction behavior from all metrics.
I started a discussion on Slack about it the implications this has for
our internal SLO calculation:
https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029

# Refs

- fixes https://github.com/neondatabase/neon/issues/9925
- sub-issue https://github.com/neondatabase/neon/issues/9377
- epic: https://github.com/neondatabase/neon/issues/9376
This commit is contained in:
Christian Schwarz
2024-12-03 12:03:23 +01:00
committed by Ivan Efremov
parent 8963ac85f9
commit dd76f1eeee
11 changed files with 373 additions and 396 deletions

View File

@@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),
counter("pageserver_tenant_throttling_count"),
*histogram("pageserver_page_service_batch_size"),
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken

View File

@@ -167,18 +167,18 @@ def test_throughput(
@dataclass
class Metrics:
time: float
pageserver_getpage_count: float
pageserver_vectored_get_count: float
pageserver_batch_size_histo_sum: float
pageserver_batch_size_histo_count: float
compute_getpage_count: float
pageserver_cpu_seconds_total: float
def __sub__(self, other: "Metrics") -> "Metrics":
return Metrics(
time=self.time - other.time,
pageserver_getpage_count=self.pageserver_getpage_count
- other.pageserver_getpage_count,
pageserver_vectored_get_count=self.pageserver_vectored_get_count
- other.pageserver_vectored_get_count,
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
- other.pageserver_batch_size_histo_sum,
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
- other.pageserver_batch_size_histo_count,
compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
- other.pageserver_cpu_seconds_total,
@@ -187,8 +187,8 @@ def test_throughput(
def normalize(self, by) -> "Metrics":
return Metrics(
time=self.time / by,
pageserver_getpage_count=self.pageserver_getpage_count / by,
pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
compute_getpage_count=self.compute_getpage_count / by,
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
)
@@ -202,11 +202,11 @@ def test_throughput(
pageserver_metrics = ps_http.get_metrics()
return Metrics(
time=time.time(),
pageserver_getpage_count=pageserver_metrics.query_one(
"pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
"pageserver_page_service_batch_size_sum"
).value,
pageserver_vectored_get_count=pageserver_metrics.query_one(
"pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
pageserver_batch_size_histo_count=pageserver_metrics.query_one(
"pageserver_page_service_batch_size_count"
).value,
compute_getpage_count=compute_getpage_count,
pageserver_cpu_seconds_total=pageserver_metrics.query_one(
@@ -243,7 +243,7 @@ def test_throughput(
# Sanity-checks on the collected data
#
# assert that getpage counts roughly match between compute and ps
assert metrics.pageserver_getpage_count == pytest.approx(
assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
metrics.compute_getpage_count, rel=0.01
)
@@ -256,7 +256,7 @@ def test_throughput(
zenbenchmark.record(
"perfmetric.batching_factor",
metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
unit="",
report=MetricReport.HIGHER_IS_BETTER,
)

View File

@@ -4,6 +4,7 @@ import copy
import json
import uuid
import pytest
from anyio import Path
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
@@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
log.info("warmup / make sure metrics are present")
run_pagebench_at_max_speed_and_get_total_requests_completed(2)
metrics_query = {
smgr_metrics_query = {
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"smgr_query_type": "get_page_at_lsn",
}
metric_name = "pageserver_smgr_query_seconds_sum"
smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
smgr_metric_name = "pageserver_smgr_query_seconds_sum"
throttle_metrics_query = {
"tenant_id": str(tenant_id),
}
throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total"
smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
assert smgr_query_seconds_pre is not None
throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
assert throttled_usecs_pre is not None
marker = uuid.uuid4().hex
ps_http.post_tracing_event("info", marker)
@@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
timeout=compaction_period,
)
log.info("validate that the metric doesn't include throttle wait time")
smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
log.info("the smgr metric includes throttle time")
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
assert smgr_query_seconds_post is not None
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
assert throttled_usecs_post is not None
actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
actual_throttled_secs = actual_throttled_usecs / 1_000_000
assert (
duration_secs >= 10 * actual_smgr_query_seconds
), "smgr metrics should not include throttle wait time"
pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds
), "smgr metrics include throttle wait time"
smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs
assert smgr_ex_throttle > 0
assert (
duration_secs > 10 * smgr_ex_throttle
), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
throttle_config_with_field_fair_set = {