From cf7a596a151487c1b3afafbe1eb2efab895326ea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 11:18:38 -0500 Subject: [PATCH] Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 + Dockerfile.build-tools | 1 + Makefile | 1 + compute/.gitignore | 5 + compute/Dockerfile.compute-node | 22 +- compute/Makefile | 35 ++ compute/etc/README.md | 17 + compute/etc/neon_collector.jsonnet | 43 +++ compute/etc/neon_collector.yml | 331 ------------------ .../etc/neon_collector_autoscaling.jsonnet | 11 + compute/etc/neon_collector_autoscaling.yml | 55 --- compute/etc/sql_exporter.jsonnet | 40 +++ compute/etc/sql_exporter.yml | 33 -- .../sql_exporter/checkpoints_req.libsonnet | 10 + compute/etc/sql_exporter/checkpoints_req.sql | 1 + .../sql_exporter/checkpoints_timed.libsonnet | 10 + .../etc/sql_exporter/checkpoints_timed.sql | 1 + .../compute_current_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_current_lsn.sql | 4 + .../compute_logical_snapshot_files.libsonnet | 12 + .../compute_logical_snapshot_files.sql | 7 + .../compute_receive_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_receive_lsn.sql | 4 + .../compute_subscriptions_count.libsonnet | 12 + .../compute_subscriptions_count.sql | 1 + .../sql_exporter/connection_counts.libsonnet | 13 + .../etc/sql_exporter/connection_counts.sql | 1 + .../etc/sql_exporter/db_total_size.libsonnet | 10 + compute/etc/sql_exporter/db_total_size.sql | 1 + .../getpage_prefetch_discards_total.libsonnet | 9 + .../getpage_prefetch_misses_total.libsonnet | 9 + .../getpage_prefetch_requests_total.libsonnet | 9 + .../getpage_sync_requests_total.libsonnet | 9 + .../getpage_wait_seconds_bucket.libsonnet | 12 + .../getpage_wait_seconds_bucket.sql | 1 + .../getpage_wait_seconds_count.libsonnet | 9 + .../getpage_wait_seconds_sum.libsonnet | 9 + ...lfc_approximate_working_set_size.libsonnet | 12 + .../lfc_approximate_working_set_size.sql | 1 + ...ing_set_size_windows.autoscaling.libsonnet | 12 + ...e_working_set_size_windows.autoscaling.sql | 8 + ...oximate_working_set_size_windows.libsonnet | 12 + ...c_approximate_working_set_size_windows.sql | 8 + .../lfc_cache_size_limit.libsonnet | 10 + .../etc/sql_exporter/lfc_cache_size_limit.sql | 1 + compute/etc/sql_exporter/lfc_hits.libsonnet | 10 + compute/etc/sql_exporter/lfc_hits.sql | 1 + compute/etc/sql_exporter/lfc_misses.libsonnet | 10 + compute/etc/sql_exporter/lfc_misses.sql | 1 + compute/etc/sql_exporter/lfc_used.libsonnet | 10 + compute/etc/sql_exporter/lfc_used.sql | 1 + compute/etc/sql_exporter/lfc_writes.libsonnet | 10 + compute/etc/sql_exporter/lfc_writes.sql | 1 + .../logical_slot_restart_lsn.libsonnet | 15 + .../sql_exporter/logical_slot_restart_lsn.sql | 3 + .../sql_exporter/max_cluster_size.libsonnet | 10 + compute/etc/sql_exporter/max_cluster_size.sql | 1 + .../etc/sql_exporter/neon_perf_counters.sql | 13 + .../pageserver_disconnects_total.libsonnet | 9 + .../pageserver_requests_sent_total.libsonnet | 9 + .../pageserver_send_flushes_total.libsonnet | 9 + .../sql_exporter/pg_stats_userdb.libsonnet | 18 + compute/etc/sql_exporter/pg_stats_userdb.sql | 10 + .../replication_delay_bytes.libsonnet | 10 + .../sql_exporter/replication_delay_bytes.sql | 6 + .../replication_delay_seconds.libsonnet | 10 + .../replication_delay_seconds.sql | 5 + .../etc/sql_exporter/retained_wal.libsonnet | 12 + compute/etc/sql_exporter/retained_wal.sql | 5 + .../etc/sql_exporter/wal_is_lost.libsonnet | 12 + compute/etc/sql_exporter/wal_is_lost.sql | 7 + compute/etc/sql_exporter_autoscaling.yml | 33 -- 72 files changed, 635 insertions(+), 457 deletions(-) create mode 100644 compute/.gitignore create mode 100644 compute/Makefile create mode 100644 compute/etc/README.md create mode 100644 compute/etc/neon_collector.jsonnet delete mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.jsonnet delete mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/sql_exporter.jsonnet delete mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter/checkpoints_req.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_req.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_timed.sql create mode 100644 compute/etc/sql_exporter/compute_current_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_current_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.sql create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.libsonnet create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.sql create mode 100644 compute/etc/sql_exporter/connection_counts.libsonnet create mode 100644 compute/etc/sql_exporter/connection_counts.sql create mode 100644 compute/etc/sql_exporter/db_total_size.libsonnet create mode 100644 compute/etc/sql_exporter/db_total_size.sql create mode 100644 compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.sql create mode 100644 compute/etc/sql_exporter/lfc_hits.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_hits.sql create mode 100644 compute/etc/sql_exporter/lfc_misses.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_misses.sql create mode 100644 compute/etc/sql_exporter/lfc_used.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_used.sql create mode 100644 compute/etc/sql_exporter/lfc_writes.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_writes.sql create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.sql create mode 100644 compute/etc/sql_exporter/max_cluster_size.libsonnet create mode 100644 compute/etc/sql_exporter/max_cluster_size.sql create mode 100644 compute/etc/sql_exporter/neon_perf_counters.sql create mode 100644 compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.sql create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.sql create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.sql create mode 100644 compute/etc/sql_exporter/retained_wal.libsonnet create mode 100644 compute/etc/sql_exporter/retained_wal.sql create mode 100644 compute/etc/sql_exporter/wal_is_lost.libsonnet create mode 100644 compute/etc/sql_exporter/wal_is_lost.sql delete mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 51f6975e63..c9a447626f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -120,6 +120,25 @@ jobs: - name: Run mypy to check types run: poetry run mypy . + check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + jsonnetfmt --test \ + $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 54e9134257..7cba1c8635 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -27,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 412c64eda4..13381b2901 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -349,7 +349,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make -C compute ######################################################################################### # @@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..45fbfa6d5e --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,35 @@ +jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) + +.PHONY: all +all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..2031eb8c85 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,43 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: [pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. - query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. - query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. -- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..1e3665ac47 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it. + target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see for syntax). + collectors: [ + 'neon_collector_autoscaling', + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). + collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..8697f8af3b --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: importstr 'sql_exporter/checkpoints_req.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..9f0b742400 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: importstr 'sql_exporter/checkpoints_timed.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. + +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. + +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. + +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..58998907a0 --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 +1,13 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. + +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. + +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml"