From 6f3e043a76dd47a18180eec627ad5f5bbade6186 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky
Date: Mon, 20 May 2024 17:00:47 -0700
Subject: [PATCH] Add some more replication slot metrics (#7761)

## Problem
We want to add alerts for when people's replication slots break, and
also metrics for retained WAL so that we can warn customers when their
storage gets bloated.

## Summary of changes
Adds the metrics:

* `retained_wal` (gauge, labelled by `slot_name`): WAL retained by each
  inactive replication slot, computed as
  `pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)`.
* `wal_is_lost` (gauge, labelled by `slot_name`): 1 when the slot's
  `wal_status` is `lost`, 0 otherwise.

LSN values that were previously exported as `pg_lsn` (the `lsn` column
and `logical_slot_restart_lsn`) are now converted with
`(<lsn> - '0/0')::FLOAT8`, because Prometheus only supports floats.
The converted `restart_lsn` expression keeps an explicit
`AS restart_lsn` alias so that the column named in `values:` still
exists in the result set.

Addresses https://github.com/neondatabase/neon/issues/7593
---
 vm-image-spec.yaml | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index fa7cd014bf..0f9d56e466 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -254,8 +254,8 @@ files:
           select
             case
               when pg_catalog.pg_is_in_recovery()
-              then pg_last_wal_replay_lsn()
-              else pg_current_wal_lsn()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
             end as lsn;
 
       - metric_name: replication_delay_bytes
@@ -294,6 +294,9 @@ files:
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
+      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
       # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
       - metric_name: logical_slot_restart_lsn
         type: gauge
@@ -302,7 +305,32 @@ files:
           - slot_name
         values: [restart_lsn]
         query: |
-          select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical';
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn from pg_replication_slots where slot_type = 'logical';
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: "Whether or not the replication slot's wal_status is lost"
+        key_labels:
+          - slot_name
+        values: [wal_status_is_lost]
+        query: |
+          SELECT slot_name,
+            CASE
+              WHEN wal_status = 'lost' THEN 1
+              ELSE 0
+            END AS wal_status_is_lost
+          FROM pg_replication_slots;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling