mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-25 23:29:59 +00:00
Generate sql_exporter config files with Jsonnet
There are quite a few benefits to this approach:
- Reduce config duplication
- The two sql_exporter configs were super similar with just a few
differences
- Pull SQL queries into standalone files
- That means we could run a SQL formatter on the file in the future
- It also means access to syntax highlighting
- In the future, run different queries for different PG versions
- This is relevant because right now, we have queries that are failing
on PG 17 due to catalog updates
Signed-off-by: Tristan Partin <tristan@neon.tech>
This commit is contained in:
19
.github/workflows/build_and_test.yml
vendored
19
.github/workflows/build_and_test.yml
vendored
@@ -120,6 +120,25 @@ jobs:
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-jsonnet:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check Jsonnet code formatting
|
||||
run: |
|
||||
jsonnetfmt --test \
|
||||
$(find . -type f -name '*.jsonnet' -o -name '*.libsonnet')
|
||||
|
||||
# Check that the vendor/postgres-* submodules point to the
|
||||
# corresponding REL_*_STABLE_neon branches.
|
||||
check-submodules:
|
||||
|
||||
@@ -27,6 +27,7 @@ RUN set -e \
|
||||
gnupg \
|
||||
gzip \
|
||||
jq \
|
||||
jsonnet \
|
||||
libcurl4-openssl-dev \
|
||||
libbz2-dev \
|
||||
libffi-dev \
|
||||
|
||||
1
Makefile
1
Makefile
@@ -291,6 +291,7 @@ postgres-check: \
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean: postgres-clean neon-pg-clean-ext
|
||||
$(MAKE) -C compute clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
|
||||
5
compute/.gitignore
vendored
Normal file
5
compute/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
# sql_exporter config files generated from Jsonnet
|
||||
etc/neon_collector.yml
|
||||
etc/neon_collector_autoscaling.yml
|
||||
etc/sql_exporter.yml
|
||||
etc/sql_exporter_autoscaling.yml
|
||||
@@ -349,7 +349,7 @@ ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
|
||||
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Preprocess the sql_exporter configuration files
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor
|
||||
|
||||
USER nonroot
|
||||
|
||||
COPY --chown=nonroot compute compute
|
||||
|
||||
RUN make -C compute
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Create remote extension download directory
|
||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
35
compute/Makefile
Normal file
35
compute/Makefile
Normal file
@@ -0,0 +1,35 @@
|
||||
jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet)
|
||||
|
||||
.PHONY: all
|
||||
all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml
|
||||
|
||||
neon_collector.yml: $(jsonnet_files)
|
||||
JSONNET_PATH=etc jsonnet \
|
||||
--output-file etc/$@ \
|
||||
etc/neon_collector.jsonnet
|
||||
|
||||
neon_collector_autoscaling.yml: $(jsonnet_files)
|
||||
JSONNET_PATH=etc jsonnet \
|
||||
--output-file etc/$@ \
|
||||
etc/neon_collector_autoscaling.jsonnet
|
||||
|
||||
sql_exporter.yml: $(jsonnet_files)
|
||||
JSONNET_PATH=etc jsonnet \
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_file=neon_collector.yml \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
sql_exporter_autoscaling.yml: $(jsonnet_files)
|
||||
JSONNET_PATH=etc jsonnet \
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_file=neon_collector_autoscaling.yml \
|
||||
--tla-str application_name=sql_exporter_autoscaling \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm --force \
|
||||
etc/neon_collector.yml \
|
||||
etc/neon_collector_autoscaling.yml \
|
||||
etc/sql_exporter.yml \
|
||||
etc/sql_exporter_autoscaling.yml
|
||||
17
compute/etc/README.md
Normal file
17
compute/etc/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Compute Configuration
|
||||
|
||||
These files are the configuration files for various other pieces of software
|
||||
that will be running in the compute alongside Postgres.
|
||||
|
||||
## `sql_exporter`
|
||||
|
||||
### Adding a `sql_exporter` Metric
|
||||
|
||||
We use `sql_exporter` to export various metrics from Postgres. In order to add
|
||||
a metric, you will need to create two files: a `libsonnet` and a `sql` file. You
|
||||
will then import the `libsonnet` file in one of the collector files, and the
|
||||
`sql` file will be imported in the `libsonnet` file.
|
||||
|
||||
In the event your statistic is an LSN, you may want to cast it to a `float8`
|
||||
because Prometheus only supports floats. It's probably fine because `float8` can
|
||||
store integers from `-2^53` to `+2^53` exactly.
|
||||
43
compute/etc/neon_collector.jsonnet
Normal file
43
compute/etc/neon_collector.jsonnet
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
collector_name: 'neon_collector',
|
||||
metrics: [
|
||||
import 'sql_exporter/checkpoints_req.libsonnet',
|
||||
import 'sql_exporter/checkpoints_timed.libsonnet',
|
||||
import 'sql_exporter/compute_current_lsn.libsonnet',
|
||||
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
|
||||
import 'sql_exporter/compute_receive_lsn.libsonnet',
|
||||
import 'sql_exporter/compute_subscriptions_count.libsonnet',
|
||||
import 'sql_exporter/connection_counts.libsonnet',
|
||||
import 'sql_exporter/db_total_size.libsonnet',
|
||||
import 'sql_exporter/getpage_prefetch_discards_total.libsonnet',
|
||||
import 'sql_exporter/getpage_prefetch_misses_total.libsonnet',
|
||||
import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
|
||||
import 'sql_exporter/getpage_sync_requests_total.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
|
||||
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
|
||||
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
|
||||
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
|
||||
import 'sql_exporter/lfc_hits.libsonnet',
|
||||
import 'sql_exporter/lfc_misses.libsonnet',
|
||||
import 'sql_exporter/lfc_used.libsonnet',
|
||||
import 'sql_exporter/lfc_writes.libsonnet',
|
||||
import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
|
||||
import 'sql_exporter/max_cluster_size.libsonnet',
|
||||
import 'sql_exporter/pageserver_disconnects_total.libsonnet',
|
||||
import 'sql_exporter/pageserver_requests_sent_total.libsonnet',
|
||||
import 'sql_exporter/pageserver_send_flushes_total.libsonnet',
|
||||
import 'sql_exporter/pg_stats_userdb.libsonnet',
|
||||
import 'sql_exporter/replication_delay_bytes.libsonnet',
|
||||
import 'sql_exporter/replication_delay_seconds.libsonnet',
|
||||
import 'sql_exporter/retained_wal.libsonnet',
|
||||
import 'sql_exporter/wal_is_lost.libsonnet',
|
||||
],
|
||||
queries: [
|
||||
{
|
||||
query_name: 'neon_perf_counters',
|
||||
query: importstr 'sql_exporter/neon_perf_counters.sql',
|
||||
},
|
||||
],
|
||||
}
|
||||
@@ -1,331 +0,0 @@
|
||||
collector_name: neon_collector
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: connection_counts
|
||||
type: gauge
|
||||
help: 'Connection counts'
|
||||
key_labels:
|
||||
- datname
|
||||
- state
|
||||
values: [count]
|
||||
query: |
|
||||
select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
|
||||
|
||||
- metric_name: pg_stats_userdb
|
||||
type: gauge
|
||||
help: 'Stats for several oldest non-system dbs'
|
||||
key_labels:
|
||||
- datname
|
||||
value_label: kind
|
||||
values:
|
||||
- db_size
|
||||
- deadlocks
|
||||
# Rows
|
||||
- inserted
|
||||
- updated
|
||||
- deleted
|
||||
# We export stats for 10 non-system database. Without this limit
|
||||
# it is too easy to abuse the system by creating lots of databases.
|
||||
query: |
|
||||
select pg_database_size(datname) as db_size, deadlocks,
|
||||
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
|
||||
datname
|
||||
from pg_stat_database
|
||||
where datname IN (
|
||||
select datname
|
||||
from pg_database
|
||||
where datname <> 'postgres' and not datistemplate
|
||||
order by oid
|
||||
limit 10
|
||||
);
|
||||
|
||||
- metric_name: max_cluster_size
|
||||
type: gauge
|
||||
help: 'neon.max_cluster_size setting'
|
||||
key_labels:
|
||||
values: [max_cluster_size]
|
||||
query: |
|
||||
select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
|
||||
|
||||
- metric_name: db_total_size
|
||||
type: gauge
|
||||
help: 'Size of all databases'
|
||||
key_labels:
|
||||
values: [total]
|
||||
query: |
|
||||
select sum(pg_database_size(datname)) as total from pg_database;
|
||||
|
||||
- metric_name: getpage_wait_seconds_count
|
||||
type: counter
|
||||
help: 'Number of getpage requests'
|
||||
values: [getpage_wait_seconds_count]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_wait_seconds_sum
|
||||
type: counter
|
||||
help: 'Time spent in getpage requests'
|
||||
values: [getpage_wait_seconds_sum]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_requests_total
|
||||
type: counter
|
||||
help: 'Number of getpage issued for prefetching'
|
||||
values: [getpage_prefetch_requests_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_sync_requests_total
|
||||
type: counter
|
||||
help: 'Number of synchronous getpage issued'
|
||||
values: [getpage_sync_requests_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_misses_total
|
||||
type: counter
|
||||
help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
|
||||
values: [getpage_prefetch_misses_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_discards_total
|
||||
type: counter
|
||||
help: 'Number of prefetch responses issued but not used'
|
||||
values: [getpage_prefetch_discards_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_requests_sent_total
|
||||
type: counter
|
||||
help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
|
||||
values: [pageserver_requests_sent_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_disconnects_total
|
||||
type: counter
|
||||
help: 'Number of times that the connection to the pageserver was lost'
|
||||
values: [pageserver_disconnects_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_send_flushes_total
|
||||
type: counter
|
||||
help: 'Number of flushes to the pageserver connection'
|
||||
values: [pageserver_send_flushes_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_wait_seconds_bucket
|
||||
type: counter
|
||||
help: 'Histogram buckets of getpage request latency'
|
||||
key_labels:
|
||||
- bucket_le
|
||||
values: [value]
|
||||
query_ref: getpage_wait_seconds_buckets
|
||||
|
||||
# DEPRECATED
|
||||
- metric_name: lfc_approximate_working_set_size
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels:
|
||||
values: [approximate_working_set_size]
|
||||
query: |
|
||||
select neon.approximate_working_set_size(false) as approximate_working_set_size;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration]
|
||||
values: [size]
|
||||
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
|
||||
# of durations in a pretty-printed form.
|
||||
query: |
|
||||
select
|
||||
x as duration,
|
||||
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
|
||||
from
|
||||
(values ('5m'),('15m'),('1h')) as t (x);
|
||||
|
||||
- metric_name: compute_current_lsn
|
||||
type: gauge
|
||||
help: 'Current LSN of the database'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
select
|
||||
case
|
||||
when pg_catalog.pg_is_in_recovery()
|
||||
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
||||
else (pg_current_wal_lsn() - '0/0')::FLOAT8
|
||||
end as lsn;
|
||||
|
||||
- metric_name: compute_receive_lsn
|
||||
type: gauge
|
||||
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery()
|
||||
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
||||
ELSE 0
|
||||
END AS lsn;
|
||||
|
||||
- metric_name: replication_delay_bytes
|
||||
type: gauge
|
||||
help: 'Bytes between received and replayed LSN'
|
||||
key_labels:
|
||||
values: [replication_delay_bytes]
|
||||
# We use a GREATEST call here because this calculation can be negative.
|
||||
# The calculation is not atomic, meaning after we've gotten the receive
|
||||
# LSN, the replay LSN may have advanced past the receive LSN we
|
||||
# are using for the calculation.
|
||||
query: |
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
|
||||
- metric_name: replication_delay_seconds
|
||||
type: gauge
|
||||
help: 'Time since last LSN was replayed'
|
||||
key_labels:
|
||||
values: [replication_delay_seconds]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
||||
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
||||
END AS replication_delay_seconds;
|
||||
|
||||
- metric_name: checkpoints_req
|
||||
type: gauge
|
||||
help: 'Number of requested checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_req]
|
||||
query: |
|
||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: checkpoints_timed
|
||||
type: gauge
|
||||
help: 'Number of scheduled checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_timed]
|
||||
query: |
|
||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: compute_logical_snapshot_files
|
||||
type: gauge
|
||||
help: 'Number of snapshot files in pg_logical/snapshot'
|
||||
key_labels:
|
||||
- timeline_id
|
||||
values: [num_logical_snapshot_files]
|
||||
query: |
|
||||
SELECT
|
||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
|
||||
-- temporary snapshot files are renamed to the actual snapshot files after they are
|
||||
-- completely built. We only WAL-log the completely built snapshot files.
|
||||
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||
|
||||
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
|
||||
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
|
||||
|
||||
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
|
||||
- metric_name: logical_slot_restart_lsn
|
||||
type: gauge
|
||||
help: 'restart_lsn of logical slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [restart_lsn]
|
||||
query: |
|
||||
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
||||
from pg_replication_slots
|
||||
where slot_type = 'logical';
|
||||
|
||||
- metric_name: compute_subscriptions_count
|
||||
type: gauge
|
||||
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
|
||||
key_labels:
|
||||
- enabled
|
||||
values: [subscriptions_count]
|
||||
query: |
|
||||
select subenabled::text as enabled, count(*) as subscriptions_count
|
||||
from pg_subscription
|
||||
group by subenabled;
|
||||
|
||||
- metric_name: retained_wal
|
||||
type: gauge
|
||||
help: 'Retained WAL in inactive replication slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [retained_wal]
|
||||
query: |
|
||||
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
|
||||
FROM pg_replication_slots
|
||||
WHERE active = false;
|
||||
|
||||
- metric_name: wal_is_lost
|
||||
type: gauge
|
||||
help: 'Whether or not the replication slot wal_status is lost'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [wal_is_lost]
|
||||
query: |
|
||||
SELECT slot_name,
|
||||
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
|
||||
FROM pg_replication_slots;
|
||||
|
||||
queries:
|
||||
- query_name: neon_perf_counters
|
||||
query: |
|
||||
WITH c AS (
|
||||
SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
|
||||
)
|
||||
SELECT d.*
|
||||
FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
|
||||
getpage_wait_seconds_count numeric,
|
||||
getpage_wait_seconds_sum numeric,
|
||||
getpage_prefetch_requests_total numeric,
|
||||
getpage_sync_requests_total numeric,
|
||||
getpage_prefetch_misses_total numeric,
|
||||
getpage_prefetch_discards_total numeric,
|
||||
pageserver_requests_sent_total numeric,
|
||||
pageserver_disconnects_total numeric,
|
||||
pageserver_send_flushes_total numeric
|
||||
);
|
||||
|
||||
- query_name: getpage_wait_seconds_buckets
|
||||
query: |
|
||||
SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
|
||||
11
compute/etc/neon_collector_autoscaling.jsonnet
Normal file
11
compute/etc/neon_collector_autoscaling.jsonnet
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
collector_name: 'neon_collector_autoscaling',
|
||||
metrics: [
|
||||
import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet',
|
||||
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
|
||||
import 'sql_exporter/lfc_hits.libsonnet',
|
||||
import 'sql_exporter/lfc_misses.libsonnet',
|
||||
import 'sql_exporter/lfc_used.libsonnet',
|
||||
import 'sql_exporter/lfc_writes.libsonnet',
|
||||
],
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
collector_name: neon_collector_autoscaling
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration_seconds]
|
||||
values: [size]
|
||||
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
|
||||
# size looking back 1..60 minutes, labeled with the number of minutes.
|
||||
query: |
|
||||
select
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) as size
|
||||
from
|
||||
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);
|
||||
40
compute/etc/sql_exporter.jsonnet
Normal file
40
compute/etc/sql_exporter.jsonnet
Normal file
@@ -0,0 +1,40 @@
|
||||
function(collector_file, application_name='sql_exporter') {
|
||||
// Configuration for sql_exporter for autoscaling-agent
|
||||
// Global defaults.
|
||||
global: {
|
||||
// If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: '10s',
|
||||
// Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: '500ms',
|
||||
// Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: '0s',
|
||||
// Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
// as will concurrent scrapes.
|
||||
max_connections: 1,
|
||||
// Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
// always be the same as max_connections.
|
||||
max_idle_connections: 1,
|
||||
// Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
// If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: '5m',
|
||||
},
|
||||
|
||||
// The target to monitor and the collectors to execute on it.
|
||||
target: {
|
||||
// Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
// the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]),
|
||||
|
||||
// Collectors (referenced by name) to execute on the target.
|
||||
// Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [
|
||||
'neon_collector_autoscaling',
|
||||
],
|
||||
},
|
||||
|
||||
// Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
// Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files: [
|
||||
collector_file,
|
||||
],
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
# Configuration for sql_exporter
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector.yml"
|
||||
10
compute/etc/sql_exporter/checkpoints_req.libsonnet
Normal file
10
compute/etc/sql_exporter/checkpoints_req.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'checkpoints_req',
|
||||
type: 'gauge',
|
||||
help: 'Number of requested checkpoints',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'checkpoints_req',
|
||||
],
|
||||
query: importstr 'sql_exporter/checkpoints_req.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/checkpoints_req.sql
Normal file
1
compute/etc/sql_exporter/checkpoints_req.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
||||
10
compute/etc/sql_exporter/checkpoints_timed.libsonnet
Normal file
10
compute/etc/sql_exporter/checkpoints_timed.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'checkpoints_timed',
|
||||
type: 'gauge',
|
||||
help: 'Number of scheduled checkpoints',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'checkpoints_timed',
|
||||
],
|
||||
query: importstr 'sql_exporter/checkpoints_timed.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/checkpoints_timed.sql
Normal file
1
compute/etc/sql_exporter/checkpoints_timed.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
||||
10
compute/etc/sql_exporter/compute_current_lsn.libsonnet
Normal file
10
compute/etc/sql_exporter/compute_current_lsn.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'compute_current_lsn',
|
||||
type: 'gauge',
|
||||
help: 'Current LSN of the database',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lsn',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_current_lsn.sql',
|
||||
}
|
||||
4
compute/etc/sql_exporter/compute_current_lsn.sql
Normal file
4
compute/etc/sql_exporter/compute_current_lsn.sql
Normal file
@@ -0,0 +1,4 @@
|
||||
SELECT CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
||||
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
|
||||
END AS lsn;
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'compute_logical_snapshot_files',
|
||||
type: 'gauge',
|
||||
help: 'Number of snapshot files in pg_logical/snapshot',
|
||||
key_labels: [
|
||||
'timeline_id',
|
||||
],
|
||||
values: [
|
||||
'num_logical_snapshot_files',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_logical_snapshot_files.sql',
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
SELECT
|
||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
||||
-- These temporary snapshot files are renamed to the actual snapshot files
|
||||
-- after they are completely built. We only WAL-log the completely built
|
||||
-- snapshot files
|
||||
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||
10
compute/etc/sql_exporter/compute_receive_lsn.libsonnet
Normal file
10
compute/etc/sql_exporter/compute_receive_lsn.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'compute_receive_lsn',
|
||||
type: 'gauge',
|
||||
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lsn',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_receive_lsn.sql',
|
||||
}
|
||||
4
compute/etc/sql_exporter/compute_receive_lsn.sql
Normal file
4
compute/etc/sql_exporter/compute_receive_lsn.sql
Normal file
@@ -0,0 +1,4 @@
|
||||
SELECT CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
||||
ELSE 0
|
||||
END AS lsn;
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'compute_subscriptions_count',
|
||||
type: 'gauge',
|
||||
help: 'Number of logical replication subscriptions grouped by enabled/disabled',
|
||||
key_labels: [
|
||||
'enabled',
|
||||
],
|
||||
values: [
|
||||
'subscriptions_count',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_subscriptions_count.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/compute_subscriptions_count.sql
Normal file
1
compute/etc/sql_exporter/compute_subscriptions_count.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;
|
||||
13
compute/etc/sql_exporter/connection_counts.libsonnet
Normal file
13
compute/etc/sql_exporter/connection_counts.libsonnet
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
metric_name: 'connection_counts',
|
||||
type: 'gauge',
|
||||
help: 'Connection counts',
|
||||
key_labels: [
|
||||
'datname',
|
||||
'state',
|
||||
],
|
||||
values: [
|
||||
'count',
|
||||
],
|
||||
query: importstr 'sql_exporter/connection_counts.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/connection_counts.sql
Normal file
1
compute/etc/sql_exporter/connection_counts.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;
|
||||
10
compute/etc/sql_exporter/db_total_size.libsonnet
Normal file
10
compute/etc/sql_exporter/db_total_size.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'db_total_size',
|
||||
type: 'gauge',
|
||||
help: 'Size of all databases',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'total',
|
||||
],
|
||||
query: importstr 'sql_exporter/db_total_size.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/db_total_size.sql
Normal file
1
compute/etc/sql_exporter/db_total_size.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_prefetch_discards_total',
|
||||
type: 'counter',
|
||||
help: 'Number of prefetch responses issued but not used',
|
||||
values: [
|
||||
'getpage_prefetch_discards_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_prefetch_misses_total',
|
||||
type: 'counter',
|
||||
help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read",
|
||||
values: [
|
||||
'getpage_prefetch_misses_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_prefetch_requests_total',
|
||||
type: 'counter',
|
||||
help: 'Number of getpage issued for prefetching',
|
||||
values: [
|
||||
'getpage_prefetch_requests_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_sync_requests_total',
|
||||
type: 'counter',
|
||||
help: 'Number of synchronous getpage issued',
|
||||
values: [
|
||||
'getpage_sync_requests_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'getpage_wait_seconds_bucket',
|
||||
type: 'counter',
|
||||
help: 'Histogram buckets of getpage request latency',
|
||||
key_labels: [
|
||||
'bucket_le',
|
||||
],
|
||||
values: [
|
||||
'value',
|
||||
],
|
||||
query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql
Normal file
1
compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_wait_seconds_count',
|
||||
type: 'counter',
|
||||
help: 'Number of getpage requests',
|
||||
values: [
|
||||
'getpage_wait_seconds_count',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'getpage_wait_seconds_sum',
|
||||
type: 'counter',
|
||||
help: 'Time spent in getpage requests',
|
||||
values: [
|
||||
'getpage_wait_seconds_sum',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
// DEPRECATED
|
||||
|
||||
{
|
||||
metric_name: 'lfc_approximate_working_set_size',
|
||||
type: 'gauge',
|
||||
help: 'Approximate working set size in pages of 8192 bytes',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'approximate_working_set_size',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql',
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size;
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'lfc_approximate_working_set_size_windows',
|
||||
type: 'gauge',
|
||||
help: 'Approximate working set size in pages of 8192 bytes',
|
||||
key_labels: [
|
||||
'duration_seconds',
|
||||
],
|
||||
values: [
|
||||
'size',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql',
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
-- NOTE: This is the "internal" / "machine-readable" version. This outputs the
|
||||
-- working set size looking back 1..60 minutes, labeled with the number of
|
||||
-- minutes.
|
||||
|
||||
SELECT
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) AS size
|
||||
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'lfc_approximate_working_set_size_windows',
|
||||
type: 'gauge',
|
||||
help: 'Approximate working set size in pages of 8192 bytes',
|
||||
key_labels: [
|
||||
'duration',
|
||||
],
|
||||
values: [
|
||||
'size',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql',
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
-- NOTE: This is the "public" / "human-readable" version. Here, we supply a
|
||||
-- small selection of durations in a pretty-printed form.
|
||||
|
||||
SELECT
|
||||
x AS duration,
|
||||
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
|
||||
VALUES ('5m'), ('15m'), ('1h')
|
||||
) AS t (x);
|
||||
10
compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet
Normal file
10
compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_cache_size_limit',
|
||||
type: 'gauge',
|
||||
help: 'LFC cache size limit in bytes',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_cache_size_limit',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_cache_size_limit.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_cache_size_limit.sql
Normal file
1
compute/etc/sql_exporter/lfc_cache_size_limit.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
|
||||
10
compute/etc/sql_exporter/lfc_hits.libsonnet
Normal file
10
compute/etc/sql_exporter/lfc_hits.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_hits',
|
||||
type: 'gauge',
|
||||
help: 'lfc_hits',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_hits',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_hits.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_hits.sql
Normal file
1
compute/etc/sql_exporter/lfc_hits.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits';
|
||||
10
compute/etc/sql_exporter/lfc_misses.libsonnet
Normal file
10
compute/etc/sql_exporter/lfc_misses.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_misses',
|
||||
type: 'gauge',
|
||||
help: 'lfc_misses',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_misses',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_misses.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_misses.sql
Normal file
1
compute/etc/sql_exporter/lfc_misses.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses';
|
||||
10
compute/etc/sql_exporter/lfc_used.libsonnet
Normal file
10
compute/etc/sql_exporter/lfc_used.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_used',
|
||||
type: 'gauge',
|
||||
help: 'LFC chunks used (chunk = 1MB)',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_used',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_used.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_used.sql
Normal file
1
compute/etc/sql_exporter/lfc_used.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used';
|
||||
10
compute/etc/sql_exporter/lfc_writes.libsonnet
Normal file
10
compute/etc/sql_exporter/lfc_writes.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_writes',
|
||||
type: 'gauge',
|
||||
help: 'lfc_writes',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_writes',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_writes.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_writes.sql
Normal file
1
compute/etc/sql_exporter/lfc_writes.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes';
|
||||
15
compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet
Normal file
15
compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet
Normal file
@@ -0,0 +1,15 @@
|
||||
// Number of slots is limited by max_replication_slots, so collecting position
|
||||
// for all of them shouldn't be bad.
|
||||
|
||||
{
|
||||
metric_name: 'logical_slot_restart_lsn',
|
||||
type: 'gauge',
|
||||
help: 'restart_lsn of logical slots',
|
||||
key_labels: [
|
||||
'slot_name',
|
||||
],
|
||||
values: [
|
||||
'restart_lsn',
|
||||
],
|
||||
query: importstr 'sql_exporter/logical_slot_restart_lsn.sql',
|
||||
}
|
||||
3
compute/etc/sql_exporter/logical_slot_restart_lsn.sql
Normal file
3
compute/etc/sql_exporter/logical_slot_restart_lsn.sql
Normal file
@@ -0,0 +1,3 @@
|
||||
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
||||
FROM pg_replication_slots
|
||||
WHERE slot_type = 'logical';
|
||||
10
compute/etc/sql_exporter/max_cluster_size.libsonnet
Normal file
10
compute/etc/sql_exporter/max_cluster_size.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'max_cluster_size',
|
||||
type: 'gauge',
|
||||
help: 'neon.max_cluster_size setting',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'max_cluster_size',
|
||||
],
|
||||
query: importstr 'sql_exporter/max_cluster_size.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/max_cluster_size.sql
Normal file
1
compute/etc/sql_exporter/max_cluster_size.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';
|
||||
13
compute/etc/sql_exporter/neon_perf_counters.sql
Normal file
13
compute/etc/sql_exporter/neon_perf_counters.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters)
|
||||
|
||||
SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
|
||||
getpage_wait_seconds_count numeric,
|
||||
getpage_wait_seconds_sum numeric,
|
||||
getpage_prefetch_requests_total numeric,
|
||||
getpage_sync_requests_total numeric,
|
||||
getpage_prefetch_misses_total numeric,
|
||||
getpage_prefetch_discards_total numeric,
|
||||
pageserver_requests_sent_total numeric,
|
||||
pageserver_disconnects_total numeric,
|
||||
pageserver_send_flushes_total numeric
|
||||
);
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'pageserver_disconnects_total',
|
||||
type: 'counter',
|
||||
help: 'Number of times that the connection to the pageserver was lost',
|
||||
values: [
|
||||
'pageserver_disconnects_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'pageserver_requests_sent_total',
|
||||
type: 'counter',
|
||||
help: 'Number of all requests sent to the pageserver (not just GetPage requests)',
|
||||
values: [
|
||||
'pageserver_requests_sent_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'pageserver_send_flushes_total',
|
||||
type: 'counter',
|
||||
help: 'Number of flushes to the pageserver connection',
|
||||
values: [
|
||||
'pageserver_send_flushes_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
18
compute/etc/sql_exporter/pg_stats_userdb.libsonnet
Normal file
18
compute/etc/sql_exporter/pg_stats_userdb.libsonnet
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
metric_name: 'pg_stats_userdb',
|
||||
type: 'gauge',
|
||||
help: 'Stats for several oldest non-system dbs',
|
||||
key_labels: [
|
||||
'datname',
|
||||
],
|
||||
value_label: 'kind',
|
||||
values: [
|
||||
'db_size',
|
||||
'deadlocks',
|
||||
// Rows
|
||||
'inserted',
|
||||
'updated',
|
||||
'deleted',
|
||||
],
|
||||
query: importstr 'sql_exporter/pg_stats_userdb.sql',
|
||||
}
|
||||
10
compute/etc/sql_exporter/pg_stats_userdb.sql
Normal file
10
compute/etc/sql_exporter/pg_stats_userdb.sql
Normal file
@@ -0,0 +1,10 @@
|
||||
-- We export stats for 10 non-system databases. Without this limit it is too
|
||||
-- easy to abuse the system by creating lots of databases.
|
||||
|
||||
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
|
||||
tup_updated AS updated, tup_deleted AS deleted, datname
|
||||
FROM pg_stat_database
|
||||
WHERE datname IN (
|
||||
SELECT datname FROM pg_database
|
||||
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
|
||||
);
|
||||
10
compute/etc/sql_exporter/replication_delay_bytes.libsonnet
Normal file
10
compute/etc/sql_exporter/replication_delay_bytes.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'replication_delay_bytes',
|
||||
type: 'gauge',
|
||||
help: 'Bytes between received and replayed LSN',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'replication_delay_bytes',
|
||||
],
|
||||
query: importstr 'sql_exporter/replication_delay_bytes.sql',
|
||||
}
|
||||
6
compute/etc/sql_exporter/replication_delay_bytes.sql
Normal file
6
compute/etc/sql_exporter/replication_delay_bytes.sql
Normal file
@@ -0,0 +1,6 @@
|
||||
-- We use a GREATEST call here because this calculation can be negative. The
|
||||
-- calculation is not atomic, meaning after we've gotten the receive LSN, the
|
||||
-- replay LSN may have advanced past the receive LSN we are using for the
|
||||
-- calculation.
|
||||
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
10
compute/etc/sql_exporter/replication_delay_seconds.libsonnet
Normal file
10
compute/etc/sql_exporter/replication_delay_seconds.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'replication_delay_seconds',
|
||||
type: 'gauge',
|
||||
help: 'Time since last LSN was replayed',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'replication_delay_seconds',
|
||||
],
|
||||
query: importstr 'sql_exporter/replication_delay_seconds.sql',
|
||||
}
|
||||
5
compute/etc/sql_exporter/replication_delay_seconds.sql
Normal file
5
compute/etc/sql_exporter/replication_delay_seconds.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
||||
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
||||
END AS replication_delay_seconds;
|
||||
12
compute/etc/sql_exporter/retained_wal.libsonnet
Normal file
12
compute/etc/sql_exporter/retained_wal.libsonnet
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'retained_wal',
|
||||
type: 'gauge',
|
||||
help: 'Retained WAL in inactive replication slots',
|
||||
key_labels: [
|
||||
'slot_name',
|
||||
],
|
||||
values: [
|
||||
'retained_wal',
|
||||
],
|
||||
query: importstr 'sql_exporter/retained_wal.sql',
|
||||
}
|
||||
5
compute/etc/sql_exporter/retained_wal.sql
Normal file
5
compute/etc/sql_exporter/retained_wal.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
SELECT
|
||||
slot_name,
|
||||
pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
|
||||
FROM pg_replication_slots
|
||||
WHERE active = false;
|
||||
12
compute/etc/sql_exporter/wal_is_lost.libsonnet
Normal file
12
compute/etc/sql_exporter/wal_is_lost.libsonnet
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
metric_name: 'wal_is_lost',
|
||||
type: 'gauge',
|
||||
help: 'Whether or not the replication slot wal_status is lost',
|
||||
key_labels: [
|
||||
'slot_name',
|
||||
],
|
||||
values: [
|
||||
'wal_is_lost',
|
||||
],
|
||||
query: importstr 'sql_exporter/wal_is_lost.sql',
|
||||
}
|
||||
7
compute/etc/sql_exporter/wal_is_lost.sql
Normal file
7
compute/etc/sql_exporter/wal_is_lost.sql
Normal file
@@ -0,0 +1,7 @@
|
||||
SELECT
|
||||
slot_name,
|
||||
CASE
|
||||
WHEN wal_status = 'lost' THEN 1
|
||||
ELSE 0
|
||||
END AS wal_is_lost
|
||||
FROM pg_replication_slots;
|
||||
@@ -1,33 +0,0 @@
|
||||
# Configuration for sql_exporter for autoscaling-agent
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector_autoscaling]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector_autoscaling.yml"
|
||||
Reference in New Issue
Block a user