Files
neon/test_runner/regress/test_hadron_ps_connectivity_metrics.py
Tristan Partin 512210bb5a [BRC-2368] Add PS and compute_ctl metrics to report pagestream request errors (#12716)
## Problem

In our experience running the system so far, almost all of the "hang
compute" situations are due to the compute (postgres) pointing at the
wrong pageservers. We currently mainly rely on the promethesus exporter
(PGExporter) running on PG to detect and report any down time, but these
can be unreliable because the read and write probes the PGExporter runs
do not always generate pageserver requests due to caching, even though
the real user might be experiencing down time when touching uncached
pages.

We are also about to start disk-wiping node pool rotation operations in
prod clusters for our pageservers, and it is critical to have a
convenient way to monitor the impact of these node pool rotations so
that we can quickly respond to any issues. These metrics should provide
very clear signals to address this operational need.

## Summary of changes

Added a pair of metrics to detect issues between postgres' PageStream
protocol (e.g. get_page_at_lsn, get_base_backup, etc.) communications
with pageservers:
* On the compute node (compute_ctl), exports a counter metric that is
incremented every time postgres requests a configuration refresh.
Postgres today only requests these configuration refreshes when it
cannot connect to a pageserver or if the pageserver rejects its request
by disconnecting.
* On the pageserver, exports a counter metric that is incremented every
time it receives a PageStream request that cannot be handled because the
tenant is not known or if the request was routed to the wrong shard
(e.g. secondary).

### How I plan to use metrics
I plan to use the metrics added here to create alerts. The alerts can
fire, for example, if these counters have been continuously increasing
for over a certain period of time. During rollouts, misrouted requests
may occasionally happen, but they should soon die down as
reconfigurations make progress. We can start with something like raising
the alert if the counters have been increasing continuously for over 5
minutes.

## How is this tested?

New integration tests in
`test_runner/regress/test_hadron_ps_connectivity_metrics.py`

Co-authored-by: William Huang <william.huang@databricks.com>
2025-07-24 19:05:00 +00:00

125 lines
5.3 KiB
Python

import json
import shutil
from fixtures.common_types import TenantShardId
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
# Helper function to attempt reconfiguration of the compute to point to a new pageserver. Note that in these tests,
# we don't expect the reconfiguration attempts to go through, as we will be pointing the compute at a "wrong" pageserver.
def _attempt_reconfiguration(endpoint: Endpoint, new_pageserver_id: int, timeout_sec: float):
try:
endpoint.reconfigure(pageserver_id=new_pageserver_id, timeout_sec=timeout_sec)
except Exception as e:
log.info(f"reconfiguration failed with exception {e}")
pass
def read_misrouted_metric_value(pageserver: NeonPageserver) -> float:
return (
pageserver.http_client()
.get_metrics()
.query_one("pageserver_misrouted_pagestream_requests_total")
.value
)
def read_request_error_metric_value(endpoint: Endpoint) -> float:
return (
parse_metrics(endpoint.http_client().metrics())
.query_one("pg_cctl_pagestream_request_errors_total")
.value
)
def test_misrouted_to_secondary(
neon_env_builder: NeonEnvBuilder,
):
"""
Tests that the following metrics are incremented when compute tries to talk to a secondary pageserver:
- On pageserver receiving the request: pageserver_misrouted_pagestream_requests_total
- On compute: pg_cctl_pagestream_request_errors_total
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start()
for ps in env.pageservers:
ps.start()
for sk in env.safekeepers:
sk.start()
# Create a tenant that has one primary and one secondary. Due to primary/secondary placement constraints,
# the primary and secondary pageservers will be different.
tenant_id, _ = env.create_tenant(shard_count=1, placement_policy=json.dumps({"Attached": 1}))
endpoint = env.endpoints.create(
"main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
)
endpoint.respec(skip_pg_catalog_updates=False)
endpoint.start()
# Get the primary pageserver serving the zero shard of the tenant, and detach it from the primary pageserver.
# This test operation configures tenant directly on the pageserver/does not go through the storage controller,
# so the compute does not get any notifications and will keep pointing at the detached pageserver.
tenant_zero_shard = TenantShardId(tenant_id, shard_number=0, shard_count=1)
primary_ps = env.get_tenant_pageserver(tenant_zero_shard)
secondary_ps = (
env.pageservers[1] if primary_ps.id == env.pageservers[0].id else env.pageservers[0]
)
# Now try to point the compute at the pageserver that is acting as secondary for the tenant. Test that the metrics
# on both compute_ctl and the pageserver register the misrouted requests following the reconfiguration attempt.
assert read_misrouted_metric_value(secondary_ps) == 0
assert read_request_error_metric_value(endpoint) == 0
_attempt_reconfiguration(endpoint, new_pageserver_id=secondary_ps.id, timeout_sec=2.0)
assert read_misrouted_metric_value(secondary_ps) > 0, "PS metric not incremented"
assert read_request_error_metric_value(endpoint) > 0, "compute_ctl metric not incremented"
def test_misrouted_to_ps_not_hosting_tenant(
neon_env_builder: NeonEnvBuilder,
):
"""
Tests that the following metrics are incremented when compute tries to talk to a pageserver that does not host the tenant:
- On pageserver receiving the request: pageserver_misrouted_pagestream_requests_total
- On compute: pg_cctl_pagestream_request_errors_total
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=False)
for ps in env.pageservers:
ps.start()
for sk in env.safekeepers:
sk.start()
tenant_id, _ = env.create_tenant(shard_count=1)
endpoint = env.endpoints.create(
"main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
)
endpoint.respec(skip_pg_catalog_updates=False)
endpoint.start()
tenant_ps_id = env.get_tenant_pageserver(
TenantShardId(tenant_id, shard_number=0, shard_count=1)
).id
non_hosting_ps = (
env.pageservers[1] if tenant_ps_id == env.pageservers[0].id else env.pageservers[0]
)
# Clear the disk of the non-hosting PS to make sure that it indeed doesn't have any information about the tenant.
non_hosting_ps.stop(immediate=True)
shutil.rmtree(non_hosting_ps.tenant_dir())
non_hosting_ps.start()
# Now try to point the compute to the non-hosting pageserver. Test that the metrics
# on both compute_ctl and the pageserver register the misrouted requests following the reconfiguration attempt.
assert read_misrouted_metric_value(non_hosting_ps) == 0
assert read_request_error_metric_value(endpoint) == 0
_attempt_reconfiguration(endpoint, new_pageserver_id=non_hosting_ps.id, timeout_sec=2.0)
assert read_misrouted_metric_value(non_hosting_ps) > 0, "PS metric not incremented"
assert read_request_error_metric_value(endpoint) > 0, "compute_ctl metric not incremented"