# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
---
commands:
  - name: cgconfigparser
    user: root
    sysvInitAction: sysinit
    shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
  # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
  # running it as root.
  - name: chmod-resize-swap
    user: root
    sysvInitAction: sysinit
    shell: 'chmod 711 /neonvm/bin/resize-swap'
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
  - name: sql-exporter-autoscaling
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
  - filename: compute_ctl-resize-swap
    content: |
      # Allow the postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # as root without requiring a password (NOPASSWD), regardless of hostname (ALL)
      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
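  # (Illustrative note, not part of the spec itself: with the sudoers rule above in place,
  # compute_ctl can resize swap by invoking the binary through sudo, along the lines of
  # `sudo /neonvm/bin/resize-swap <size>`; the exact arguments are whatever compute_ctl
  # passes and are not defined in this file.)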
  - filename: pgbouncer.ini
    content: |
      [databases]
      *=host=localhost port=5432 auth_user=cloud_admin
      [pgbouncer]
      listen_port=6432
      listen_addr=0.0.0.0
      auth_type=scram-sha-256
      auth_user=cloud_admin
      auth_dbname=postgres
      client_tls_sslmode=disable
      server_tls_sslmode=disable
      pool_mode=transaction
      max_client_conn=10000
      default_pool_size=64
      max_prepared_statements=0
      admin_users=postgres
      unix_socket_dir=/tmp/
      unix_socket_mode=0777
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
      group neon-postgres {
          perm {
              admin {
                  uid = postgres;
              }
              task {
                  gid = users;
              }
          }
          memory {}
      }
  - filename: sql_exporter.yml
    content: |
      # Configuration for sql_exporter
      # Global defaults.
      global:
        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
        scrape_timeout: 10s
        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
        scrape_timeout_offset: 500ms
        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
        min_interval: 0s
        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
        # as will concurrent scrapes.
        max_connections: 1
        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
        # always be the same as max_connections.
        max_idle_connections: 1
        # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
        # If 0, connections are not closed due to a connection's age.
        max_connection_lifetime: 5m

      # The target to monitor and the collectors to execute on it.
      target:
        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
        # the schema gets dropped or replaced to match the driver expected DSN format.
        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'

        # Collectors (referenced by name) to execute on the target.
        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
        collectors: [neon_collector]

      # Collector files specifies a list of globs. One collector definition is read from each matching file.
      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
      collector_files:
        - "neon_collector.yml"
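  # (Illustrative check, not part of the image spec: with the sql-exporter command from the
  # top of this file running, its scrape endpoint can be probed from inside the VM with
  #   curl -s http://localhost:9399/metrics
  # where 9399 is the port from -web.listen-address; the autoscaling instance below listens
  # on 9499.)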
  - filename: sql_exporter_autoscaling.yml
    content: |
      # Configuration for sql_exporter for autoscaling-agent
      # Global defaults.
      global:
        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
        scrape_timeout: 10s
        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
        scrape_timeout_offset: 500ms
        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
        min_interval: 0s
        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
        # as will concurrent scrapes.
        max_connections: 1
        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
        # always be the same as max_connections.
        max_idle_connections: 1
        # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
        # If 0, connections are not closed due to a connection's age.
        max_connection_lifetime: 5m

      # The target to monitor and the collectors to execute on it.
      target:
        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
        # the schema gets dropped or replaced to match the driver expected DSN format.
        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'

        # Collectors (referenced by name) to execute on the target.
        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
        collectors: [neon_collector_autoscaling]

      # Collector files specifies a list of globs. One collector definition is read from each matching file.
      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
      collector_files:
        - "neon_collector_autoscaling.yml"
  - filename: neon_collector.yml
    content: |
      collector_name: neon_collector
      metrics:
      - metric_name: lfc_misses
        type: gauge
        help: 'lfc_misses'
        key_labels:
        values: [lfc_misses]
        query: |
          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
      - metric_name: lfc_used
        type: gauge
        help: 'LFC chunks used (chunk = 1MB)'
        key_labels:
        values: [lfc_used]
        query: |
          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
      - metric_name: lfc_hits
        type: gauge
        help: 'lfc_hits'
        key_labels:
        values: [lfc_hits]
        query: |
          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
      - metric_name: lfc_writes
        type: gauge
        help: 'lfc_writes'
        key_labels:
        values: [lfc_writes]
        query: |
          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
      - metric_name: lfc_cache_size_limit
        type: gauge
        help: 'LFC cache size limit in bytes'
        key_labels:
        values: [lfc_cache_size_limit]
        query: |
          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
      - metric_name: connection_counts
        type: gauge
        help: 'Connection counts'
        key_labels:
        - datname
        - state
        values: [count]
        query: |
          select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
      - metric_name: pg_stats_userdb
        type: gauge
        help: 'Stats for several oldest non-system dbs'
        key_labels:
        - datname
        value_label: kind
        values:
        - db_size
        - deadlocks
        # Rows
        - inserted
        - updated
        - deleted
        # We export stats for 10 non-system databases. Without this limit
        # it is too easy to abuse the system by creating lots of databases.
        query: |
          select pg_database_size(datname) as db_size, deadlocks,
            tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
            datname
          from pg_stat_database
          where datname IN (
            select datname
            from pg_database
            where datname <> 'postgres' and not datistemplate
            order by oid
            limit 10
          );
      - metric_name: max_cluster_size
        type: gauge
        help: 'neon.max_cluster_size setting'
        key_labels:
        values: [max_cluster_size]
        query: |
          select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
      - metric_name: db_total_size
        type: gauge
        help: 'Size of all databases'
        key_labels:
        values: [total]
        query: |
          select sum(pg_database_size(datname)) as total from pg_database;
      # DEPRECATED
      - metric_name: lfc_approximate_working_set_size
        type: gauge
        help: 'Approximate working set size in pages of 8192 bytes'
        key_labels:
        values: [approximate_working_set_size]
        query: |
          select neon.approximate_working_set_size(false) as approximate_working_set_size;
      - metric_name: lfc_approximate_working_set_size_windows
        type: gauge
        help: 'Approximate working set size in pages of 8192 bytes'
        key_labels: [duration]
        values: [size]
        # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
        # of durations in a pretty-printed form.
        query: |
          select
            x as duration,
            neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
          from (values ('5m'), ('15m'), ('1h')) as t (x);
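      # (Illustrative only: the query above yields one sample per pretty-printed duration,
      # which sql_exporter renders as, e.g.:
      #   lfc_approximate_working_set_size_windows{duration="5m"} 1234
      #   lfc_approximate_working_set_size_windows{duration="1h"} 5678
      # with made-up sizes.)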
      - metric_name: compute_current_lsn
        type: gauge
        help: 'Current LSN of the database'
        key_labels:
        values: [lsn]
        query: |
          select
            case
              when pg_catalog.pg_is_in_recovery() then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
              else (pg_current_wal_lsn() - '0/0')::FLOAT8
            end as lsn;
      - metric_name: compute_receive_lsn
        type: gauge
        help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
        key_labels:
        values: [lsn]
        query: |
          SELECT
            CASE
              WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
              ELSE 0
            END AS lsn;
      - metric_name: replication_delay_bytes
        type: gauge
        help: 'Bytes between received and replayed LSN'
        key_labels:
        values: [replication_delay_bytes]
        # We use a GREATEST call here because this calculation can be negative.
        # The calculation is not atomic, meaning after we've gotten the receive
        # LSN, the replay LSN may have advanced past the receive LSN we
        # are using for the calculation.
        query: |
          SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
      - metric_name: replication_delay_seconds
        type: gauge
        help: 'Time since last LSN was replayed'
        key_labels:
        values: [replication_delay_seconds]
        query: |
          SELECT
            CASE
              WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
              ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
            END AS replication_delay_seconds;
      - metric_name: checkpoints_req
        type: gauge
        help: 'Number of requested checkpoints'
        key_labels:
        values: [checkpoints_req]
        query: |
          SELECT checkpoints_req FROM pg_stat_bgwriter;
      - metric_name: checkpoints_timed
        type: gauge
        help: 'Number of scheduled checkpoints'
        key_labels:
        values: [checkpoints_timed]
        query: |
          SELECT checkpoints_timed FROM pg_stat_bgwriter;
      - metric_name: compute_logical_snapshot_files
        type: gauge
        help: 'Number of snapshot files in pg_logical/snapshot'
        key_labels:
        - timeline_id
        values: [num_logical_snapshot_files]
        query: |
          SELECT
            (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
            -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
            -- temporary snapshot files are renamed to the actual snapshot files after they are
            -- completely built. We only WAL-log the completely built snapshot files.
            (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
      # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
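      # (For scale: 2^53 bytes is 8 PiB, so the cast only loses precision once an LSN passes
      # roughly 8 PiB of WAL history, far beyond anything expected here.)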
      - metric_name: logical_slot_restart_lsn
        type: gauge
        help: 'restart_lsn of logical slots'
        key_labels:
        - slot_name
        values: [restart_lsn]
        query: |
          select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
          from pg_replication_slots
          where slot_type = 'logical';
      - metric_name: compute_subscriptions_count
        type: gauge
        help: 'Number of logical replication subscriptions grouped by enabled/disabled'
        key_labels:
        - enabled
        values: [subscriptions_count]
        query: |
          select subenabled::text as enabled, count(*) as subscriptions_count
          from pg_subscription
          group by subenabled;
      - metric_name: retained_wal
        type: gauge
        help: 'Retained WAL in inactive replication slots'
        key_labels:
        - slot_name
        values: [retained_wal]
        query: |
          SELECT
            slot_name,
            pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
          FROM pg_replication_slots
          WHERE active = false;
      - metric_name: wal_is_lost
        type: gauge
        help: 'Whether or not the replication slot wal_status is lost'
        key_labels:
        - slot_name
        values: [wal_is_lost]
        query: |
          SELECT
            slot_name,
            CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
          FROM pg_replication_slots;
  - filename: neon_collector_autoscaling.yml
    content: |
      collector_name: neon_collector_autoscaling
      metrics:
      - metric_name: lfc_misses
        type: gauge
        help: 'lfc_misses'
        key_labels:
        values: [lfc_misses]
        query: |
          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
      - metric_name: lfc_used
        type: gauge
        help: 'LFC chunks used (chunk = 1MB)'
        key_labels:
        values: [lfc_used]
        query: |
          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
      - metric_name: lfc_hits
        type: gauge
        help: 'lfc_hits'
        key_labels:
        values: [lfc_hits]
        query: |
          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
      - metric_name: lfc_writes
        type: gauge
        help: 'lfc_writes'
        key_labels:
        values: [lfc_writes]
        query: |
          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
      - metric_name: lfc_cache_size_limit
        type: gauge
        help: 'LFC cache size limit in bytes'
        key_labels:
        values: [lfc_cache_size_limit]
        query: |
          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
      - metric_name: lfc_approximate_working_set_size_windows
        type: gauge
        help: 'Approximate working set size in pages of 8192 bytes'
        key_labels: [duration_seconds]
        values: [size]
        # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
        # size looking back 1..60 minutes, labeled with the window length in seconds.
        query: |
          select
            x::text as duration_seconds,
            neon.approximate_working_set_size_seconds(x) as size
          from (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
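      # (Illustrative only: the query above yields 60 samples per scrape, one per minute-long
      # step of the lookback window, labeled in seconds, e.g.:
      #   lfc_approximate_working_set_size_windows{duration_seconds="60"} 1234
      #   lfc_approximate_working_set_size_windows{duration_seconds="3600"} 5678
      # with made-up sizes.)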
build: |
  # Build cgroup-tools
  #
  # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
  # requires cgroup v2, so we'll build cgroup-tools ourselves.
  FROM debian:bullseye-slim AS libcgroup-builder
  ENV LIBCGROUP_VERSION=v2.0.3

  RUN set -exu \
      && apt update \
      && apt install --no-install-recommends -y \
             git \
             ca-certificates \
             automake \
             cmake \
             make \
             gcc \
             byacc \
             flex \
             libtool \
             libpam0g-dev \
      && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
      && INSTALL_DIR="/libcgroup-install" \
      && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
      && cd libcgroup \
      # extracted from bootstrap.sh, with modified flags:
      && (test -d m4 || mkdir m4) \
      && autoreconf -fi \
      && rm -rf autom4te.cache \
      && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
      # actually build the thing...
      && make install

  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter

  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter

  # Build pgbouncer
  #
  FROM debian:bullseye-slim AS pgbouncer
  RUN set -e \
      && apt-get update \
      && apt-get install -y \
             build-essential \
             git \
             libevent-dev \
             libtool \
             pkg-config

  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
  ENV PGBOUNCER_TAG=pgbouncer_1_22_1
  RUN set -e \
      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
      && cd pgbouncer \
      && ./autogen.sh \
      && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
      && make -j $(nproc) dist_man_MANS= \
      && make install dist_man_MANS=
merge: |
  # tweak nofile limits
  RUN set -e \
      && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
      && test ! -e /etc/security || ( \
             echo '*    - nofile 1048576' >>/etc/security/limits.conf \
             && echo 'root - nofile 1048576' >>/etc/security/limits.conf \
         )

  # Allow postgres user (compute_ctl) to run swap resizer.
  # Need to install sudo in order to allow this.
  #
  # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
  RUN set -e \
      && apt update \
      && apt install --no-install-recommends -y \
             sudo \
      && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
  COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap

  COPY cgconfig.conf /etc/cgconfig.conf
  COPY pgbouncer.ini /etc/pgbouncer.ini
  COPY sql_exporter.yml /etc/sql_exporter.yml
  COPY neon_collector.yml /etc/neon_collector.yml
  COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
  COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml

  RUN set -e \
      && chown postgres:postgres /etc/pgbouncer.ini \
      && chmod 0666 /etc/pgbouncer.ini \
      && chmod 0644 /etc/cgconfig.conf \
      && chmod 0644 /etc/sql_exporter.yml \
      && chmod 0644 /etc/neon_collector.yml \
      && chmod 0644 /etc/sql_exporter_autoscaling.yml \
      && chmod 0644 /etc/neon_collector_autoscaling.yml

  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

  COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
  COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter

  COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
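# (Illustrative usage, not part of the spec: vm-builder consumes a file like this when
# wrapping the compute image into a VM image; the flag names below are assumed from the
# autoscaling repo's tooling and may differ:
#   vm-builder -spec=vm-image-spec.yaml -src=<compute-image> -dst=<vm-compute-image>
# )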