From 5e0409de95ed1d19ffdb36c31b12792c49938635 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 25 Jul 2024 15:45:15 -0500 Subject: [PATCH] Fix negative replication delay metric In some cases, we can get a negative metric for replication_delay_bytes. My best guess from all the research I've done is that pg_last_wal_receive_lsn() is evaluated before pg_last_wal_replay_lsn(), and that by the time the replay LSN is read, it has already advanced past the receive LSN captured a moment earlier. In that case the lag is effectively 0, because WAL is being replayed as fast as it is received, so the metric is now clamped to a minimum of 0. --- vm-image-spec.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 2767710bad..7d005c7139 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -277,8 +277,12 @@ files: help: 'Bytes between received and replayed LSN' key_labels: values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. query: | - SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - metric_name: replication_delay_seconds type: gauge