pageserver: add metrics for get page batch breaking reasons (#11545)

## Problem

https://github.com/neondatabase/neon/pull/11494 changes the batching
logic, but we currently have no way to evaluate its impact.

## Summary of changes

This PR introduces a global and a per-timeline metric that track the reason
a batch was broken.
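For context, the new counter is exposed with a `reason` label and can be scraped per timeline (the updated test below queries `pageserver_page_service_batch_break_reason_total` filtered by `timeline_id`). A benchmark can diff two scrapes and normalize by elapsed time to get per-reason break rates. The sketch below illustrates that delta/normalize step; the `break_rate` helper and the example reason values (`batch_full`, `lsn_changed`, `key_changed`) are illustrative assumptions, not the actual label values emitted by the pageserver.

```python
# Minimal sketch (not part of this PR): turn two scrapes of the per-reason
# counter into per-second break rates, mirroring the delta/normalize pattern
# used by the benchmark below. Reason label values are made up for the example.
from __future__ import annotations


def break_rate(
    before: dict[str, int], after: dict[str, int], elapsed_seconds: float
) -> dict[str, float]:
    # The metric is a monotonically increasing counter, so the per-run count is
    # the difference between the two scrapes; reasons that first appear during
    # the run default to a baseline of 0.
    return {
        reason: (count - before.get(reason, 0)) / elapsed_seconds
        for reason, count in after.items()
    }


if __name__ == "__main__":
    before = {"batch_full": 1_000, "lsn_changed": 40}
    after = {"batch_full": 1_600, "lsn_changed": 49, "key_changed": 7}
    # e.g. {'batch_full': 20.0, 'lsn_changed': 0.3, 'key_changed': 0.23...}
    print(break_rate(before, after, elapsed_seconds=30.0))
```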
Author: Vlad Lazar, 2025-04-14 14:24:47 +01:00 (committed by GitHub)
Parent: daebe50e19
Commit: 148b3701cf
4 changed files with 309 additions and 134 deletions


@@ -1,7 +1,6 @@
 import concurrent.futures
 import dataclasses
 import json
-import re
 import threading
 import time
 from dataclasses import dataclass
@@ -170,6 +169,7 @@ def test_throughput(
 time: float
 pageserver_batch_size_histo_sum: float
 pageserver_batch_size_histo_count: float
+pageserver_batch_breaks_reason_count: dict[str, int]
 compute_getpage_count: float
 pageserver_cpu_seconds_total: float
@@ -183,6 +183,10 @@ def test_throughput(
 compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
 - other.pageserver_cpu_seconds_total,
+pageserver_batch_breaks_reason_count={
+    reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
+    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+},
 )
 def normalize(self, by) -> "Metrics":
@@ -192,6 +196,10 @@ def test_throughput(
 pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
 compute_getpage_count=self.compute_getpage_count / by,
 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
+pageserver_batch_breaks_reason_count={
+    reason: count / by
+    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+},
 )
 def get_metrics() -> Metrics:
@@ -201,6 +209,20 @@ def test_throughput(
 )
 compute_getpage_count = cur.fetchall()[0][0]
 pageserver_metrics = ps_http.get_metrics()
+for name, samples in pageserver_metrics.metrics.items():
+    for sample in samples:
+        log.info(f"{name=} labels={sample.labels} {sample.value}")
+raw_batch_break_reason_count = pageserver_metrics.query_all(
+    "pageserver_page_service_batch_break_reason_total",
+    filter={"timeline_id": str(env.initial_timeline)},
+)
+batch_break_reason_count = {
+    sample.labels["reason"]: int(sample.value)
+    for sample in raw_batch_break_reason_count
+}
 return Metrics(
 time=time.time(),
 pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
@@ -209,6 +231,7 @@ def test_throughput(
 pageserver_batch_size_histo_count=pageserver_metrics.query_one(
 "pageserver_page_service_batch_size_count"
 ).value,
+pageserver_batch_breaks_reason_count=batch_break_reason_count,
 compute_getpage_count=compute_getpage_count,
 pageserver_cpu_seconds_total=pageserver_metrics.query_one(
 "libmetrics_process_cpu_seconds_highres"
@@ -263,25 +286,6 @@ def test_throughput(
 log.info("Results: %s", metrics)
-since_last_start: list[str] = []
-for line in env.pageserver.logfile.read_text().splitlines():
-    if "git:" in line:
-        since_last_start = []
-    since_last_start.append(line)
-stopping_batching_because_re = re.compile(
-    r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
-)
-reasons_for_stopping_batching = {}
-for line in since_last_start:
-    match = stopping_batching_because_re.search(line)
-    if match:
-        if match.group(1) not in reasons_for_stopping_batching:
-            reasons_for_stopping_batching[match.group(1)] = 0
-        reasons_for_stopping_batching[match.group(1)] += 1
-log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
 #
 # Sanity-checks on the collected data
 #
@@ -295,7 +299,16 @@ def test_throughput(
 #
 for metric, value in dataclasses.asdict(metrics).items():
-    zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
+    if metric == "pageserver_batch_breaks_reason_count":
+        assert isinstance(value, dict)
+        for reason, count in value.items():
+            zenbenchmark.record(
+                f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
+            )
+    else:
+        zenbenchmark.record(
+            f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
+        )
 zenbenchmark.record(
 "perfmetric.batching_factor",