mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 19:42:55 +00:00
## Problem

In our experience running the system so far, almost all of the "hang compute" situations are due to the compute (postgres) pointing at the wrong pageservers. We currently rely mainly on the Prometheus exporter (PGExporter) running on PG to detect and report any downtime, but it can be unreliable because the read and write probes the PGExporter runs do not always generate pageserver requests due to caching, even though the real user might be experiencing downtime when touching uncached pages.

We are also about to start disk-wiping node pool rotation operations in prod clusters for our pageservers, and it is critical to have a convenient way to monitor the impact of these node pool rotations so that we can quickly respond to any issues. These metrics should provide very clear signals to address this operational need.

## Summary of changes

Added a pair of metrics to detect issues in postgres' PageStream protocol (e.g. get_page_at_lsn, get_base_backup, etc.) communications with pageservers:

* On the compute node (compute_ctl), exports a counter metric that is incremented every time postgres requests a configuration refresh. Postgres today only requests these configuration refreshes when it cannot connect to a pageserver or if the pageserver rejects its request by disconnecting.
* On the pageserver, exports a counter metric that is incremented every time it receives a PageStream request that cannot be handled because the tenant is not known or if the request was routed to the wrong shard (e.g. secondary).

### How I plan to use metrics

I plan to use the metrics added here to create alerts. The alerts can fire, for example, if these counters have been continuously increasing for over a certain period of time. During rollouts, misrouted requests may occasionally happen, but they should soon die down as reconfigurations make progress.
We can start with something like raising the alert if the counters have been increasing continuously for over 5 minutes.

## How is this tested?

New integration tests in `test_runner/regress/test_hadron_ps_connectivity_metrics.py`

Co-authored-by: William Huang <william.huang@databricks.com>
125 lines
5.3 KiB
Python
125 lines
5.3 KiB
Python
import json
|
|
import shutil
|
|
|
|
from fixtures.common_types import TenantShardId
|
|
from fixtures.log_helper import log
|
|
from fixtures.metrics import parse_metrics
|
|
from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
|
|
|
|
|
|
# Helper function to attempt reconfiguration of the compute to point to a new pageserver. Note that in these tests,
# we don't expect the reconfiguration attempts to go through, as we will be pointing the compute at a "wrong" pageserver.
def _attempt_reconfiguration(
    endpoint: Endpoint, new_pageserver_id: int, timeout_sec: float
) -> None:
    """
    Make a best-effort attempt to repoint `endpoint` at pageserver `new_pageserver_id`.

    Calls `endpoint.reconfigure(pageserver_id=..., timeout_sec=...)`. Any exception it
    raises is logged and swallowed, so the caller can go on to inspect metrics whether
    or not the reconfiguration went through.
    """
    try:
        endpoint.reconfigure(pageserver_id=new_pageserver_id, timeout_sec=timeout_sec)
    except Exception as e:
        # Deliberately broad: the callers point the compute at a "wrong" pageserver, so a
        # failure here is the expected outcome and must not abort the test.
        log.info(f"reconfiguration failed with exception {e}")
|
|
|
|
|
|
def read_misrouted_metric_value(pageserver: NeonPageserver) -> float:
    """
    Return the current value of `pageserver_misrouted_pagestream_requests_total`
    as reported by the given pageserver's HTTP metrics.
    """
    ps_metrics = pageserver.http_client().get_metrics()
    misrouted_sample = ps_metrics.query_one("pageserver_misrouted_pagestream_requests_total")
    return misrouted_sample.value
|
|
|
|
|
|
def read_request_error_metric_value(endpoint: Endpoint) -> float:
    """
    Return the current value of `pg_cctl_pagestream_request_errors_total`
    parsed from the metrics text served by the endpoint's HTTP client.
    """
    raw_metrics = endpoint.http_client().metrics()
    parsed_metrics = parse_metrics(raw_metrics)
    error_sample = parsed_metrics.query_one("pg_cctl_pagestream_request_errors_total")
    return error_sample.value
|
|
|
|
|
|
def test_misrouted_to_secondary(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Tests that the following metrics are incremented when compute tries to talk to a secondary pageserver:
    - On pageserver receiving the request: pageserver_misrouted_pagestream_requests_total
    - On compute: pg_cctl_pagestream_request_errors_total
    """
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.broker.start()
    env.storage_controller.start()
    for ps in env.pageservers:
        ps.start()
    for sk in env.safekeepers:
        sk.start()

    # Create a tenant that has one primary and one secondary. Due to primary/secondary placement constraints,
    # the primary and secondary pageservers will be different.
    tenant_id, _ = env.create_tenant(shard_count=1, placement_policy=json.dumps({"Attached": 1}))
    endpoint = env.endpoints.create(
        "main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
    )
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Look up the pageserver currently serving the tenant's zero shard (the primary). Nothing is detached
    # or reconfigured at this point: with exactly two pageservers, the one that is not the primary is the
    # one holding the secondary location.
    tenant_zero_shard = TenantShardId(tenant_id, shard_number=0, shard_count=1)

    primary_ps = env.get_tenant_pageserver(tenant_zero_shard)
    # NOTE(review): the lookup presumably can come back empty if the shard is unknown; fail with a clear
    # message here instead of an AttributeError on `.id` below.
    assert primary_ps is not None, "no pageserver found for the tenant's zero shard"
    secondary_ps = (
        env.pageservers[1] if primary_ps.id == env.pageservers[0].id else env.pageservers[0]
    )

    # Now try to point the compute at the pageserver that is acting as secondary for the tenant. Test that the metrics
    # on both compute_ctl and the pageserver register the misrouted requests following the reconfiguration attempt.
    assert read_misrouted_metric_value(secondary_ps) == 0
    assert read_request_error_metric_value(endpoint) == 0
    _attempt_reconfiguration(endpoint, new_pageserver_id=secondary_ps.id, timeout_sec=2.0)
    assert read_misrouted_metric_value(secondary_ps) > 0, "PS metric not incremented"
    assert read_request_error_metric_value(endpoint) > 0, "compute_ctl metric not incremented"
|
|
|
|
|
|
def test_misrouted_to_ps_not_hosting_tenant(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Tests that the following metrics are incremented when compute tries to talk to a pageserver that does not host the tenant:
    - On pageserver receiving the request: pageserver_misrouted_pagestream_requests_total
    - On compute: pg_cctl_pagestream_request_errors_total
    """
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.broker.start()
    # NOTE(review): `handle_ps_local_disk_loss=False` presumably keeps the storage controller from reacting
    # to the disk wipe performed below — confirm against the storage controller fixture.
    env.storage_controller.start(handle_ps_local_disk_loss=False)
    for ps in env.pageservers:
        ps.start()
    for sk in env.safekeepers:
        sk.start()

    tenant_id, _ = env.create_tenant(shard_count=1)
    endpoint = env.endpoints.create(
        "main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
    )
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Find the pageserver serving the tenant's zero shard; with exactly two pageservers, the other one
    # is the pageserver that does not host the tenant.
    tenant_ps = env.get_tenant_pageserver(TenantShardId(tenant_id, shard_number=0, shard_count=1))
    # NOTE(review): the lookup presumably can come back empty if the shard is unknown; fail with a clear
    # message here instead of an AttributeError on `.id`.
    assert tenant_ps is not None, "no pageserver found for the tenant's zero shard"
    tenant_ps_id = tenant_ps.id
    non_hosting_ps = (
        env.pageservers[1] if tenant_ps_id == env.pageservers[0].id else env.pageservers[0]
    )

    # Clear the disk of the non-hosting PS to make sure that it indeed doesn't have any information about the tenant.
    non_hosting_ps.stop(immediate=True)
    shutil.rmtree(non_hosting_ps.tenant_dir())
    non_hosting_ps.start()

    # Now try to point the compute to the non-hosting pageserver. Test that the metrics
    # on both compute_ctl and the pageserver register the misrouted requests following the reconfiguration attempt.
    assert read_misrouted_metric_value(non_hosting_ps) == 0
    assert read_request_error_metric_value(endpoint) == 0
    _attempt_reconfiguration(endpoint, new_pageserver_id=non_hosting_ps.id, timeout_sec=2.0)
    assert read_misrouted_metric_value(non_hosting_ps) > 0, "PS metric not incremented"
    assert read_request_error_metric_value(endpoint) > 0, "compute_ctl metric not incremented"
|