mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
Per convention, histogram buckets have the '_bucket' suffix. I got that
wrong in commit 0d500bbd5b.
Fixes https://github.com/neondatabase/neon/issues/9241
332 lines
10 KiB
YAML
332 lines
10 KiB
YAML
# Identifier of this collector. NOTE(review): the layout matches a sql_exporter
# collector file, where targets reference collectors by this name — confirm.
collector_name: neon_collector
|
|
# List of exported metrics. Each entry either carries its own `query` or points
# at a shared entry of the top-level `queries` list through `query_ref`.
metrics:
|
|
# Exports the 'file_cache_misses' entry of neon.neon_lfc_stats.
- metric_name: lfc_misses
  type: gauge
  help: 'lfc_misses'
  key_labels:
  values:
    - lfc_misses
  query: |
    SELECT lfc_value AS lfc_misses
    FROM neon.neon_lfc_stats
    WHERE lfc_key = 'file_cache_misses';
|
|
|
|
# Exports the 'file_cache_used' entry of neon.neon_lfc_stats.
- metric_name: lfc_used
  type: gauge
  help: 'LFC chunks used (chunk = 1MB)'
  key_labels:
  values:
    - lfc_used
  query: |
    SELECT lfc_value AS lfc_used
    FROM neon.neon_lfc_stats
    WHERE lfc_key = 'file_cache_used';
|
|
|
|
# Exports the 'file_cache_hits' entry of neon.neon_lfc_stats.
- metric_name: lfc_hits
  type: gauge
  help: 'lfc_hits'
  key_labels:
  values:
    - lfc_hits
  query: |
    SELECT lfc_value AS lfc_hits
    FROM neon.neon_lfc_stats
    WHERE lfc_key = 'file_cache_hits';
|
|
|
|
# Exports the 'file_cache_writes' entry of neon.neon_lfc_stats.
- metric_name: lfc_writes
  type: gauge
  help: 'lfc_writes'
  key_labels:
  values:
    - lfc_writes
  query: |
    SELECT lfc_value AS lfc_writes
    FROM neon.neon_lfc_stats
    WHERE lfc_key = 'file_cache_writes';
|
|
|
|
# Reads the neon.file_cache_size_limit setting and converts its textual size
# (as returned by current_setting) to a byte count with pg_size_bytes().
- metric_name: lfc_cache_size_limit
  type: gauge
  help: 'LFC cache size limit in bytes'
  key_labels:
  values:
    - lfc_cache_size_limit
  query: |
    SELECT
      pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
|
|
|
|
# Number of pg_stat_activity rows per (database, state) pair.
# The `state <> ''` predicate drops rows with an empty state and, because a
# comparison with NULL is never true, rows whose state is NULL as well.
- metric_name: connection_counts
  type: gauge
  help: 'Connection counts'
  key_labels: [datname, state]
  values:
    - count
  query: |
    SELECT
      datname,
      state,
      count(*) AS count
    FROM pg_stat_activity
    WHERE state <> ''
    GROUP BY datname, state;
|
|
|
|
# Per-database size, deadlock count and row-change counters, emitted as one
# metric whose `kind` label names the value column.
- metric_name: pg_stats_userdb
  type: gauge
  help: 'Stats for several oldest non-system dbs'
  key_labels: [datname]
  value_label: kind
  values:
    - db_size
    - deadlocks
    # Row counters
    - inserted
    - updated
    - deleted
  # Stats are exported for at most 10 non-system databases (lowest oid first).
  # Without this limit it would be too easy to abuse the system by creating
  # lots of databases.
  query: |
    SELECT
      pg_database_size(datname) AS db_size,
      deadlocks,
      tup_inserted AS inserted,
      tup_updated AS updated,
      tup_deleted AS deleted,
      datname
    FROM pg_stat_database
    WHERE datname IN (
      SELECT datname
      FROM pg_database
      WHERE datname <> 'postgres'
        AND NOT datistemplate
      ORDER BY oid
      LIMIT 10
    );
|
|
|
|
# Exports the neon.max_cluster_size setting from pg_settings, cast to an integer.
- metric_name: max_cluster_size
  type: gauge
  help: 'neon.max_cluster_size setting'
  key_labels:
  values:
    - max_cluster_size
  query: |
    SELECT setting::int AS max_cluster_size
    FROM pg_settings
    WHERE name = 'neon.max_cluster_size';
|
|
|
|
# Sum of pg_database_size() over every row of pg_database.
- metric_name: db_total_size
  type: gauge
  help: 'Size of all databases'
  key_labels:
  values:
    - total
  query: |
    SELECT sum(pg_database_size(datname)) AS total
    FROM pg_database;
|
|
|
|
# Value is the getpage_wait_seconds_count column of the shared
# neon_perf_counters query.
- metric_name: getpage_wait_seconds_count
  type: counter
  help: 'Number of getpage requests'
  values:
    - getpage_wait_seconds_count
  query_ref: neon_perf_counters
|
|
|
|
# Value is the getpage_wait_seconds_sum column of the shared
# neon_perf_counters query.
- metric_name: getpage_wait_seconds_sum
  type: counter
  help: 'Time spent in getpage requests'
  values:
    - getpage_wait_seconds_sum
  query_ref: neon_perf_counters
|
|
|
|
# Value is the getpage_prefetch_requests_total column of the shared
# neon_perf_counters query.
- metric_name: getpage_prefetch_requests_total
  type: counter
  help: 'Number of getpage issued for prefetching'
  values:
    - getpage_prefetch_requests_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the getpage_sync_requests_total column of the shared
# neon_perf_counters query.
- metric_name: getpage_sync_requests_total
  type: counter
  help: 'Number of synchronous getpage issued'
  values:
    - getpage_sync_requests_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the getpage_prefetch_misses_total column of the shared
# neon_perf_counters query.
- metric_name: getpage_prefetch_misses_total
  type: counter
  help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
  values:
    - getpage_prefetch_misses_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the getpage_prefetch_discards_total column of the shared
# neon_perf_counters query.
- metric_name: getpage_prefetch_discards_total
  type: counter
  help: 'Number of prefetch responses issued but not used'
  values:
    - getpage_prefetch_discards_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the pageserver_requests_sent_total column of the shared
# neon_perf_counters query.
- metric_name: pageserver_requests_sent_total
  type: counter
  help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
  values:
    - pageserver_requests_sent_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the pageserver_disconnects_total column of the shared
# neon_perf_counters query.
- metric_name: pageserver_disconnects_total
  type: counter
  help: 'Number of times that the connection to the pageserver was lost'
  values:
    - pageserver_disconnects_total
  query_ref: neon_perf_counters
|
|
|
|
# Value is the pageserver_send_flushes_total column of the shared
# neon_perf_counters query.
- metric_name: pageserver_send_flushes_total
  type: counter
  help: 'Number of flushes to the pageserver connection'
  values:
    - pageserver_send_flushes_total
  query_ref: neon_perf_counters
|
|
|
|
# One series per bucket_le value; rows come from the shared
# getpage_wait_seconds_buckets query, which returns (bucket_le, value).
- metric_name: getpage_wait_seconds_bucket
  type: counter
  help: 'Histogram buckets of getpage request latency'
  key_labels: [bucket_le]
  values:
    - value
  query_ref: getpage_wait_seconds_buckets
|
|
|
|
# DEPRECATED
# Calls neon.approximate_working_set_size(false) and exports the result.
- metric_name: lfc_approximate_working_set_size
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels:
  values:
    - approximate_working_set_size
  query: |
    SELECT
      neon.approximate_working_set_size(false) AS approximate_working_set_size;
|
|
|
|
# Working set size estimated over a few fixed look-back windows.
- metric_name: lfc_approximate_working_set_size_windows
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels:
    - duration
  values:
    - size
  # NOTE: This is the "public" / "human-readable" variant: only a small
  # selection of durations is queried, and the `duration` label carries them in
  # pretty-printed form ('5m', '15m', '1h'). Each one is converted to whole
  # seconds before being passed to the neon function.
  query: |
    SELECT
      x AS duration,
      neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size
    FROM (VALUES ('5m'), ('15m'), ('1h')) AS t (x);
|
|
|
|
# Current WAL position expressed as a byte offset from LSN 0/0, as FLOAT8.
# While in recovery the last replayed LSN is reported; otherwise the current
# write LSN is used.
- metric_name: compute_current_lsn
  type: gauge
  help: 'Current LSN of the database'
  key_labels:
  values:
    - lsn
  query: |
    SELECT
      CASE
        WHEN pg_catalog.pg_is_in_recovery()
          THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
        ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
      END AS lsn;
|
|
|
|
# Last received WAL position as a byte offset from LSN 0/0 (FLOAT8).
# Reports a constant 0 when the server is not in recovery.
- metric_name: compute_receive_lsn
  type: gauge
  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
  key_labels:
  values:
    - lsn
  query: |
    SELECT
      CASE
        WHEN pg_catalog.pg_is_in_recovery()
          THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
        ELSE 0
      END AS lsn;
|
|
|
|
# Byte distance between the last received and the last replayed LSN.
- metric_name: replication_delay_bytes
  type: gauge
  help: 'Bytes between received and replayed LSN'
  key_labels:
  values:
    - replication_delay_bytes
  # The two LSNs are not read atomically: by the time the replay LSN is
  # fetched it may already have moved past the receive LSN obtained a moment
  # earlier, making the difference negative. GREATEST clamps that case to 0.
  query: |
    SELECT
      GREATEST(
        0,
        pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())
      ) AS replication_delay_bytes;
|
|
|
|
# Seconds elapsed since the last replayed transaction's timestamp.
# Reports 0 when the receive and replay LSNs are equal; the elapsed time is
# clamped so that it never goes below 0.
- metric_name: replication_delay_seconds
  type: gauge
  help: 'Time since last LSN was replayed'
  key_labels:
  values:
    - replication_delay_seconds
  query: |
    SELECT
      CASE
        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
          THEN 0
        ELSE GREATEST(0, EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()))
      END AS replication_delay_seconds;
|
|
|
|
# Exports pg_stat_bgwriter.checkpoints_req.
# NOTE(review): PostgreSQL 17 moved the checkpoint counters out of
# pg_stat_bgwriter into pg_stat_checkpointer — confirm which server versions
# this collector has to support.
- metric_name: checkpoints_req
  type: gauge
  help: 'Number of requested checkpoints'
  key_labels:
  values:
    - checkpoints_req
  query: |
    SELECT checkpoints_req
    FROM pg_stat_bgwriter;
|
|
|
|
# Exports pg_stat_bgwriter.checkpoints_timed.
# NOTE(review): PostgreSQL 17 moved the checkpoint counters out of
# pg_stat_bgwriter into pg_stat_checkpointer — confirm which server versions
# this collector has to support.
- metric_name: checkpoints_timed
  type: gauge
  help: 'Number of scheduled checkpoints'
  key_labels:
  values:
    - checkpoints_timed
  query: |
    SELECT checkpoints_timed
    FROM pg_stat_bgwriter;
|
|
|
|
# Counts the completed logical-decoding snapshot files in pg_logical/snapshots
# and labels the value with the neon.timeline_id setting.
# The help text names the directory the query actually lists
# (pg_logical/snapshots, plural).
- metric_name: compute_logical_snapshot_files
  type: gauge
  help: 'Number of snapshot files in pg_logical/snapshots'
  key_labels:
    - timeline_id
  values:
    - num_logical_snapshot_files
  query: |
    SELECT
      (
        SELECT setting
        FROM pg_settings
        WHERE name = 'neon.timeline_id'
      ) AS timeline_id,
      -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
      -- They are renamed to the actual snapshot files once completely built, and
      -- only completely built snapshot files are WAL-logged. Matching on the
      -- '.snap' suffix therefore skips the temporary files.
      (
        SELECT COUNT(*)
        FROM pg_ls_dir('pg_logical/snapshots') AS name
        WHERE name LIKE '%.snap'
      ) AS num_logical_snapshot_files;
|
|
|
|
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
|
|
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
|
|
|
|
# The number of slots is bounded by max_replication_slots, so collecting the
# position of every logical slot is cheap enough.
# restart_lsn is exported as a byte offset from LSN 0/0, cast to FLOAT8.
- metric_name: logical_slot_restart_lsn
  type: gauge
  help: 'restart_lsn of logical slots'
  key_labels: [slot_name]
  values:
    - restart_lsn
  query: |
    SELECT
      slot_name,
      (restart_lsn - '0/0')::FLOAT8 AS restart_lsn
    FROM pg_replication_slots
    WHERE slot_type = 'logical';
|
|
|
|
# Number of pg_subscription rows per subenabled value; the boolean is rendered
# as text so that it can serve as the `enabled` label.
- metric_name: compute_subscriptions_count
  type: gauge
  help: 'Number of logical replication subscriptions grouped by enabled/disabled'
  key_labels: [enabled]
  values:
    - subscriptions_count
  query: |
    SELECT
      subenabled::text AS enabled,
      count(*) AS subscriptions_count
    FROM pg_subscription
    GROUP BY subenabled;
|
|
|
|
# Bytes of WAL between the current WAL position and restart_lsn, for every
# replication slot that is not active.
# pg_current_wal_lsn() raises an error while the server is in recovery, so on a
# standby the last replayed LSN stands in for the current position. This is the
# same pg_is_in_recovery() switch the compute_current_lsn metric uses. Outside
# recovery the result is unchanged.
- metric_name: retained_wal
  type: gauge
  help: 'Retained WAL in inactive replication slots'
  key_labels:
    - slot_name
  values: [retained_wal]
  query: |
    SELECT
      slot_name,
      pg_wal_lsn_diff(
        CASE
          WHEN pg_catalog.pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
          ELSE pg_current_wal_lsn()
        END,
        restart_lsn
      )::FLOAT8 AS retained_wal
    FROM pg_replication_slots
    WHERE active = false;
|
|
|
|
# Per-slot flag: 1 when pg_replication_slots.wal_status is 'lost', 0 in every
# other case (including a NULL wal_status, which falls through to ELSE).
- metric_name: wal_is_lost
  type: gauge
  help: 'Whether or not the replication slot wal_status is lost'
  key_labels: [slot_name]
  values:
    - wal_is_lost
  query: |
    SELECT
      slot_name,
      CASE
        WHEN wal_status = 'lost' THEN 1
        ELSE 0
      END AS wal_is_lost
    FROM pg_replication_slots;
|
|
|
|
# Named queries shared by several metrics; a metric selects one via `query_ref`.
queries:
|
|
# Pivots the (metric, value) rows of neon.neon_perf_counters into a single row
# with one numeric column per counter: the rows are first folded into one JSONB
# object keyed by metric name, which jsonb_to_record() then expands using the
# column list below. Counters not listed here are not exposed by this query.
- query_name: neon_perf_counters
  query: |
    WITH counters AS (
      SELECT pg_catalog.jsonb_object_agg(metric, value) AS counters_json
      FROM neon.neon_perf_counters
    )
    SELECT rec.*
    FROM pg_catalog.jsonb_to_record((SELECT counters_json FROM counters)) AS rec(
      getpage_wait_seconds_count numeric,
      getpage_wait_seconds_sum numeric,
      getpage_prefetch_requests_total numeric,
      getpage_sync_requests_total numeric,
      getpage_prefetch_misses_total numeric,
      getpage_prefetch_discards_total numeric,
      pageserver_requests_sent_total numeric,
      pageserver_disconnects_total numeric,
      pageserver_send_flushes_total numeric
    );
|
|
|
|
# Returns one (bucket_le, value) row per neon.neon_perf_counters entry whose
# metric is 'getpage_wait_seconds_bucket'.
- query_name: getpage_wait_seconds_buckets
  query: |
    SELECT
      bucket_le,
      value
    FROM neon.neon_perf_counters
    WHERE metric = 'getpage_wait_seconds_bucket';
|