From 4b77807de9bc2ea550aff812303f44b71e64aefd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 6 Mar 2025 16:32:17 +0100 Subject: [PATCH] fix(compute/sql_exporter): Ignore invalid DBs when collecting size (#11097) ## Problem Original Slack discussion: https://neondb.slack.com/archives/C04DGM6SMTM/p1739915430147169 TL;DR in Postgres, it's totally normal to have 'invalid' DBs (state after the interrupted `DROP DATABASE`). Yet, some of our metrics collected with `sql_exporter` try to get the size of such invalid DBs. Typical log lines: ``` time=2025-03-05T16:30:32.368Z level=ERROR source=promhttp.go:52 msg="Error gathering metrics" error="[from Gatherer #1] [collector=neon_collector,query=pg_stats_userdb] pq: [NEON_SMGR] [reqid 0] could not read db size of db 173228 from page server at lsn 0/44A0E8C0" time=2025-03-05T16:30:32.369Z level=ERROR source=promhttp.go:52 msg="Error gathering metrics" error="[from Gatherer #1] [collector=neon_collector,query=db_total_size] pq: [NEON_SMGR] [reqid 0] could not read db size of db 173228 from page server at lsn 0/44A0E8C0" ``` ## Summary of changes Ignore invalid DBs in these two metrics -- `pg_stats_userdb` and `db_total_size` --- compute/etc/sql_exporter/db_total_size.sql | 6 +++++- compute/etc/sql_exporter/pg_stats_userdb.sql | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index 9cbbdfd8a3..fe0360ab5c 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total FROM pg_database; +SELECT sum(pg_database_size(datname)) AS total +FROM pg_database +-- Ignore invalid databases, as we will likely have problems with +-- getting their size from the Pageserver. +WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 00ada87370..12e6c4ae59 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,10 +1,20 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, - tup_updated AS updated, tup_deleted AS deleted, datname +SELECT pg_database_size(datname) AS db_size, + deadlocks, + tup_inserted AS inserted, + tup_updated AS updated, + tup_deleted AS deleted, + datname FROM pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database - WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 + -- Ignore invalid databases, as we will likely have problems with + -- getting their size from the Pageserver. + WHERE datconnlimit != -2 + AND datname <> 'postgres' + AND NOT datistemplate + ORDER BY oid + LIMIT 10 );