From 6f3e043a76dd47a18180eec627ad5f5bbade6186 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Mon, 20 May 2024 17:00:47 -0700
Subject: [PATCH] Add some more replication slot metrics (#7761)

## Problem
We want to add alerts for when people's replication slots break, and
also metrics for retained WAL so that we can warn customers when
their storage gets bloated.

## Summary of changes
Adds the metrics. Addresses
https://github.com/neondatabase/neon/issues/7593
---
 vm-image-spec.yaml | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index fa7cd014bf..0f9d56e466 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -254,8 +254,8 @@ files:
           select
             case
               when pg_catalog.pg_is_in_recovery()
-              then pg_last_wal_replay_lsn()
-              else pg_current_wal_lsn()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
             end as lsn;
 
       - metric_name: replication_delay_bytes
@@ -294,6 +294,9 @@ files:
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
+      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
       # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
       - metric_name: logical_slot_restart_lsn
         type: gauge
@@ -302,7 +305,32 @@ files:
           - slot_name
         values: [restart_lsn]
         query: |
-          select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical';
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn from pg_replication_slots where slot_type = 'logical';
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(CASE WHEN pg_catalog.pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END, restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: "Whether or not the replication slot's wal_status is lost"
+        key_labels:
+          - slot_name
+        values: [wal_status_is_lost]
+        query: |
+          SELECT slot_name,
+          CASE
+            WHEN wal_status = 'lost' THEN 1
+            ELSE 0
+          END AS wal_status_is_lost
+          FROM pg_replication_slots;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling