Increase range of expected value for working set approximation test

Add jsonnetfmt targets
This should make it a little bit easier for people wanting to check if their files are formatted correctly. Has the added bonus of making the CI check simpler as well. Signed-off-by: Tristan Partin <tristan@neon.tech>
2026-05-22 15:41:15 +00:00 · 2024-10-16 18:59:19 +03:00 · 2024-10-15 20:01:13 -05:00 · 2024-10-15 16:30:31 -05:00 · 2024-10-15 23:13:31 +02:00 · 2024-10-15 16:35:21 -04:00
138 changed files with 2467 additions and 1002 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -183,7 +183,7 @@ runs:
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

    - name: Store Allure test stat in the DB (new)
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -88,7 +88,7 @@ runs:
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

    - name: Install Python deps
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -124,28 +124,28 @@ jobs:
        uses: actions/cache@v4
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v4
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
        uses: actions/cache@v4
        with:
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
        uses: actions/cache@v4
        with:
          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -19,9 +19,16 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

-concurrency:
-  group: build-build-tools-image-${{ inputs.image-tag }}
-  cancel-in-progress: false
+# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image
+# for the same tag in parallel workflow runs, and queue them to be skipped once we have
+# the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected.
+# GitHub can't have more than 1 job in a queue and removes the previous one, which causes failures of the dependent jobs.
+#
+# Ref https://github.com/orgs/community/discussions/41518
+#
+# concurrency:
+#   group: build-build-tools-image-${{ inputs.image-tag }}
+#   cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -36,6 +43,7 @@ jobs:

    strategy:
      matrix:
+        debian-version: [ bullseye, bookworm ]
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -74,22 +82,22 @@ jobs:

      - uses: docker/build-push-action@v6
        with:
+          file: Dockerfile.build-tools
          context: .
          provenance: false
          push: true
          pull: true
-          file: Dockerfile.build-tools
-          cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
-          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+          build-args: |
+            DEBIAN_VERSION=${{ matrix.debian-version }}
+          cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }}
+          tags: |
+            neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }}

  merge-images:
    needs: [ build-image ]
    runs-on: ubuntu-22.04

-    env:
-      IMAGE_TAG: ${{ inputs.image-tag }}
-
    steps:
      - uses: docker/login-action@v3
        with:
@@ -97,7 +105,17 @@ jobs:
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

      - name: Create multi-arch image
+        env:
+          DEFAULT_DEBIAN_VERSION: bullseye
+          IMAGE_TAG: ${{ inputs.image-tag }}
        run: |
-          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
-                                             neondatabase/build-tools:${IMAGE_TAG}-x64 \
-                                             neondatabase/build-tools:${IMAGE_TAG}-arm64
+          for debian_version in bullseye bookworm; do
+            tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}")
+            if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
+              tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}")
+            fi
+
+            docker buildx imagetools create "${tags[@]}" \
+                                              neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \
+                                              neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64
+          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -92,7 +92,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -106,7 +106,7 @@ jobs:
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync
@@ -120,6 +120,24 @@ jobs:
      - name: Run mypy to check types
        run: poetry run mypy .

+  check-codestyle-jsonnet:
+    needs: [ check-permissions, build-build-tools-image ]
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Check Jsonnet code formatting
+        run: |
+          make -C compute jsonnetfmt-test
+
  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
@@ -181,7 +199,7 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -261,7 +279,7 @@ jobs:
    uses: ./.github/workflows/_build-and-test-locally.yml
    with:
      arch: ${{ matrix.arch }}
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
@@ -276,7 +294,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -289,7 +307,7 @@ jobs:
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync
@@ -309,7 +327,7 @@ jobs:
    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -367,7 +385,7 @@ jobs:

    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -415,7 +433,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -559,15 +577,16 @@ jobs:
            ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm
+            DEBIAN_VERSION=bookworm
          provenance: false
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }}
          tags: |
-            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }}

  neon-image:
    needs: [ neon-image-arch, tag ]
@@ -582,8 +601,9 @@ jobs:
      - name: Create multi-arch image
        run: |
          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
+                                          -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64

      - uses: docker/login-action@v3
        with:
@@ -604,17 +624,16 @@ jobs:
        version:
          # Much data was already generated on old PG versions with bullseye's
          # libraries, the locales of which can cause data incompatibilities.
-          # However, new PG versions should check if they can be built on newer
-          # images, as that reduces the support burden of old and ancient
-          # distros.
+          # However, new PG versions should be built on newer images,
+          # as that reduces the support burden of old and ancient distros.
          - pg: v14
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v15
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v16
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v17
-            debian: bookworm-slim
+            debian: bookworm
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -659,16 +678,16 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
-          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}

      - name: Build neon extensions test image
        if: matrix.version.pg == 'v16'
@@ -679,17 +698,17 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
          target: neon-pg-ext-test
-          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
+            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
@@ -704,14 +723,16 @@ jobs:
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
+          cache-from: type=registry,ref=cache.neon.build/compute-tools-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
@@ -719,7 +740,16 @@ jobs:

    strategy:
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+            debian: bullseye
+          - pg: v15
+            debian: bullseye
+          - pg: v16
+            debian: bullseye
+          - pg: v17
+            debian: bookworm

    steps:
      - uses: docker/login-action@v3
@@ -729,23 +759,26 @@ jobs:

      - name: Create multi-arch compute-node image
        run: |
-          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                          -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - name: Create multi-arch neon-test-extensions image
-        if: matrix.version == 'v16'
+        if: matrix.version.pg == 'v16'
        run: |
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                          -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v17'
+        if: matrix.version.pg == 'v16'
        run: |
          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
+                                          -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - uses: docker/login-action@v3
        with:
@@ -753,13 +786,13 @@ jobs:
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
+      - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v17'
+        if: matrix.version.pg == 'v16'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -770,7 +803,16 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+            debian: bullseye
+          - pg: v15
+            debian: bullseye
+          - pg: v16
+            debian: bullseye
+          - pg: v17
+            debian: bookworm
    env:
      VM_BUILDER_VERSION: v0.35.0

@@ -792,18 +834,18 @@ jobs:
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Build vm image
        run: |
          ./vm-builder \
-            -spec=compute/vm-image-spec.yaml \
-            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
+            -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Pushing vm-compute-node image
        run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -155,7 +155,7 @@ jobs:
      github.ref_name == 'main'
    runs-on: [ self-hosted, large ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -55,7 +55,7 @@ jobs:
    runs-on: ubuntu-22.04

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -150,7 +150,7 @@ jobs:
    runs-on: ubuntu-22.04

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -71,7 +71,6 @@ jobs:

    steps:
      - uses: docker/login-action@v3
-
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -94,8 +93,22 @@ jobs:
          az acr login --name=neoneastus2

      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
+        env:
+          DEFAULT_DEBIAN_VERSION: bullseye
        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-                                          -t neondatabase/build-tools:${TO_TAG} \
-                                             neondatabase/build-tools:${FROM_TAG}
+          for debian_version in bullseye bookworm; do
+            tags=()
+
+            tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}")
+            tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}")
+            tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}")
+
+            if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
+              tags+=("-t" "neondatabase/build-tools:${TO_TAG}")
+              tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}")
+              tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}")
+            fi
+
+            docker buildx imagetools create "${tags[@]}" \
+                                              neondatabase/build-tools:${FROM_TAG}-${debian_version}
+          done
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
 dependencies = [
 "equivalent",
 "hashbrown 0.14.5",
+ "serde",
 ]

 [[package]]
@@ -2794,9 +2795,9 @@ dependencies = [

 [[package]]
 name = "itoa"
-version = "1.0.6"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

 [[package]]
 name = "jobserver"
@@ -4296,6 +4297,7 @@ dependencies = [
 "indexmap 2.0.1",
 "ipnet",
 "itertools 0.10.5",
+ "itoa",
 "jose-jwa",
 "jose-jwk",
 "lasso",
@@ -7307,6 +7309,7 @@ dependencies = [
 "hyper 1.4.1",
 "hyper-util",
 "indexmap 1.9.3",
+ "indexmap 2.0.1",
 "itertools 0.12.1",
 "lazy_static",
 "libc",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,6 +107,7 @@ indexmap = "2"
 indoc = "2"
 ipnet = "2.9.0"
 itertools = "0.10"
+itoa = "1.0.11"
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,6 +7,8 @@ ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG DEFAULT_PG_VERSION=17
 ARG STABLE_PG_VERSION=16
+ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

 # Build Postgres
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -57,7 +59,7 @@ RUN set -e \

 # Build final image
 #
-FROM debian:bullseye-slim
+FROM debian:${DEBIAN_FLAVOR}
 ARG DEFAULT_PG_VERSION
 WORKDIR /data

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -1,12 +1,7 @@
-FROM debian:bullseye-slim
+ARG DEBIAN_VERSION=bullseye

-# Use ARG as a build-time environment variable here to allow.
-# It's not supposed to be set outside.
-# Alternatively it can be obtained using the following command
-# ```
-# . /etc/os-release && echo "${VERSION_CODENAME}"
-# ```
-ARG DEBIAN_VERSION_CODENAME=bullseye
+FROM debian:${DEBIAN_VERSION}-slim
+ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
@@ -32,6 +27,7 @@ RUN set -e \
        gnupg \
        gzip \
        jq \
+        jsonnet \
        libcurl4-openssl-dev \
        libbz2-dev \
        libffi-dev \
@@ -42,14 +38,14 @@ RUN set -e \
        libseccomp-dev \
        libsqlite3-dev \
        libssl-dev \
-        libstdc++-10-dev \
+        $([ "${DEBIAN_VERSION}" = "bullseye" ] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
        libtool \
        libxml2-dev \
        libxmlsec1-dev \
        libxxhash-dev \
        lsof \
        make \
-        netcat \
+        netcat-openbsd \
        net-tools \
        openssh-client \
        parallel \
@@ -78,7 +74,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
 # LLVM
 ENV LLVM_VERSION=18
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
+    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
@@ -86,7 +82,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

 # Install docker
 RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
-    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
    && apt update \
    && apt install -y docker-ce docker-ce-cli \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--- a/Makefile
+++ b/Makefile
@@ -291,6 +291,7 @@ postgres-check: \
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
 clean: postgres-clean neon-pg-clean-ext
+	$(MAKE) -C compute clean
 	$(CARGO_CMD_PREFIX) cargo clean

 # This removes everything
--- a/compute/.gitignore
+++ b/compute/.gitignore
@@ -0,0 +1,5 @@
+# sql_exporter config files generated from Jsonnet
+etc/neon_collector.yml
+etc/neon_collector_autoscaling.yml
+etc/sql_exporter.yml
+etc/sql_exporter_autoscaling.yml
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
-ARG DEBIAN_FLAVOR=bullseye-slim
+ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

 #########################################################################################
 #
@@ -11,20 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim
 #
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR AS build-deps
-ARG DEBIAN_FLAVOR
+ARG DEBIAN_VERSION

-RUN case $DEBIAN_FLAVOR in \
+RUN case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
      # Install newer version (3.25) from backports.
-      bullseye*) \
+      bullseye) \
        echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
        VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
      ;; \
      # Version-specific installs for Bookworm (PG17):
-      bookworm*) \
+      bookworm) \
        VERSION_INSTALLS="cmake"; \
      ;; \
+      *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+      ;; \
    esac && \
    apt update &&  \
    apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \
@@ -345,7 +349,7 @@ ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
-# doesn't use releases, last commit f3d82fd - Mar 2, 2023 
+# doesn't use releases, last commit f3d82fd - Mar 2, 2023
 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
    echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
@@ -925,8 +929,8 @@ ARG PG_VERSION
 RUN case "${PG_VERSION}" in "v17") \
    echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
    esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release
@@ -1091,7 +1095,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
 #########################################################################################

 FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
-ARG DEBIAN_FLAVOR

 COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

@@ -1102,7 +1105,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu
 #########################################################################################

 FROM debian:$DEBIAN_FLAVOR AS pgbouncer
-ARG DEBIAN_FLAVOR
 RUN set -e \
    && apt-get update \
    && apt-get install --no-install-recommends -y \
@@ -1167,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

+#########################################################################################
+#
+# Preprocess the sql_exporter configuration files
+#
+#########################################################################################
+FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor
+
+USER nonroot
+
+COPY --chown=nonroot compute compute
+
+RUN make -C compute

 #########################################################################################
 #
@@ -1257,7 +1271,7 @@ ENV PGDATABASE=postgres
 #
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR
-ARG DEBIAN_FLAVOR
+ARG DEBIAN_VERSION
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
@@ -1285,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
 COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
 COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter

-COPY --chmod=0644 compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
-COPY --chmod=0644 compute/etc/neon_collector.yml             /etc/neon_collector.yml
-COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
-COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
+COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
+COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml             /etc/neon_collector.yml
+COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
+COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml

 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
@@ -1305,19 +1319,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca


 RUN apt update && \
-    case $DEBIAN_FLAVOR in \
+    case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # libicu67, locales for collations (including ICU and plpgsql_check)
      # libgdal28, libproj19 for PostGIS
-      bullseye*) \
+      bullseye) \
        VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
      ;; \
      # Version-specific installs for Bookworm (PG17):
      # libicu72, locales for collations (including ICU and plpgsql_check)
      # libgdal32, libproj25 for PostGIS
-      bookworm*) \
+      bookworm) \
        VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
      ;; \
+      *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+      ;; \
    esac && \
    apt install --no-install-recommends -y \
        gdb \
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -0,0 +1,45 @@
+jsonnet_files = $(wildcard \
+	etc/*.jsonnet \
+	etc/sql_exporter/*.libsonnet)
+
+.PHONY: all
+all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml
+
+neon_collector.yml: $(jsonnet_files)
+	JSONNET_PATH=etc jsonnet \
+		--output-file etc/$@ \
+		etc/neon_collector.jsonnet
+
+neon_collector_autoscaling.yml: $(jsonnet_files)
+	JSONNET_PATH=etc jsonnet \
+		--output-file etc/$@ \
+		etc/neon_collector_autoscaling.jsonnet
+
+sql_exporter.yml: $(jsonnet_files)
+	JSONNET_PATH=etc jsonnet \
+		--output-file etc/$@ \
+		--tla-str collector_file=neon_collector.yml \
+		etc/sql_exporter.jsonnet
+
+sql_exporter_autoscaling.yml: $(jsonnet_files)
+	JSONNET_PATH=etc jsonnet \
+		--output-file etc/$@ \
+		--tla-str collector_file=neon_collector_autoscaling.yml \
+		--tla-str application_name=sql_exporter_autoscaling \
+		etc/sql_exporter.jsonnet
+
+.PHONY: clean
+clean:
+	rm --force \
+		etc/neon_collector.yml \
+		etc/neon_collector_autoscaling.yml \
+		etc/sql_exporter.yml \
+		etc/sql_exporter_autoscaling.yml
+
+.PHONY: jsonnetfmt-test
+jsonnetfmt-test:
+	jsonnetfmt --test $(jsonnet_files)
+
+.PHONY: jsonnetfmt-format
+jsonnetfmt-format:
+	jsonnetfmt --in-place $(jsonnet_files)
--- a/compute/etc/README.md
+++ b/compute/etc/README.md
@@ -0,0 +1,17 @@
+# Compute Configuration
+
+These files are the configuration files for various other pieces of software
+that will be running in the compute alongside Postgres.
+
+## `sql_exporter`
+
+### Adding a `sql_exporter` Metric
+
+We use `sql_exporter` to export various metrics from Postgres. In order to add
+a metric, you will need to create two files: a `libsonnet` and a `sql` file. You
+will then import the `libsonnet` file in one of the collector files, and the
+`sql` file will be imported in the `libsonnet` file.
+
+In the event your statistic is an LSN, you may want to cast it to a `float8`
+because Prometheus only supports floats. It's probably fine because `float8` can
+store integers from `-2^53` to `+2^53` exactly.
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -0,0 +1,51 @@
+{
+  collector_name: 'neon_collector',
+  metrics: [
+    import 'sql_exporter/checkpoints_req.libsonnet',
+    import 'sql_exporter/checkpoints_timed.libsonnet',
+    import 'sql_exporter/compute_current_lsn.libsonnet',
+    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+    import 'sql_exporter/compute_receive_lsn.libsonnet',
+    import 'sql_exporter/compute_subscriptions_count.libsonnet',
+    import 'sql_exporter/connection_counts.libsonnet',
+    import 'sql_exporter/db_total_size.libsonnet',
+    import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet',
+    import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet',
+    import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet',
+    import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet',
+    import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet',
+    import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet',
+    import 'sql_exporter/getpage_prefetch_discards_total.libsonnet',
+    import 'sql_exporter/getpage_prefetch_misses_total.libsonnet',
+    import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
+    import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
+    import 'sql_exporter/getpage_sync_requests_total.libsonnet',
+    import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
+    import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
+    import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
+    import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
+    import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
+    import 'sql_exporter/lfc_cache_size_limit.libsonnet',
+    import 'sql_exporter/lfc_hits.libsonnet',
+    import 'sql_exporter/lfc_misses.libsonnet',
+    import 'sql_exporter/lfc_used.libsonnet',
+    import 'sql_exporter/lfc_writes.libsonnet',
+    import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
+    import 'sql_exporter/max_cluster_size.libsonnet',
+    import 'sql_exporter/pageserver_disconnects_total.libsonnet',
+    import 'sql_exporter/pageserver_requests_sent_total.libsonnet',
+    import 'sql_exporter/pageserver_send_flushes_total.libsonnet',
+    import 'sql_exporter/pageserver_open_requests.libsonnet',
+    import 'sql_exporter/pg_stats_userdb.libsonnet',
+    import 'sql_exporter/replication_delay_bytes.libsonnet',
+    import 'sql_exporter/replication_delay_seconds.libsonnet',
+    import 'sql_exporter/retained_wal.libsonnet',
+    import 'sql_exporter/wal_is_lost.libsonnet',
+  ],
+  queries: [
+    {
+      query_name: 'neon_perf_counters',
+      query: importstr 'sql_exporter/neon_perf_counters.sql',
+    },
+  ],
+}
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -1,331 +0,0 @@
-collector_name: neon_collector
-metrics:
- metric_name: lfc_misses
-  type: gauge
-  help: 'lfc_misses'
-  key_labels:
-  values: [lfc_misses]
-  query: |
-    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
- metric_name: lfc_used
-  type: gauge
-  help: 'LFC chunks used (chunk = 1MB)'
-  key_labels:
-  values: [lfc_used]
-  query: |
-    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
- metric_name: lfc_hits
-  type: gauge
-  help: 'lfc_hits'
-  key_labels:
-  values: [lfc_hits]
-  query: |
-    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
- metric_name: lfc_writes
-  type: gauge
-  help: 'lfc_writes'
-  key_labels:
-  values: [lfc_writes]
-  query: |
-    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
- metric_name: lfc_cache_size_limit
-  type: gauge
-  help: 'LFC cache size limit in bytes'
-  key_labels:
-  values: [lfc_cache_size_limit]
-  query: |
-    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
- metric_name: connection_counts
-  type: gauge
-  help: 'Connection counts'
-  key_labels:
-    - datname
-    - state
-  values: [count]
-  query: |
-    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
-
- metric_name: pg_stats_userdb
-  type: gauge
-  help: 'Stats for several oldest non-system dbs'
-  key_labels:
-    - datname
-  value_label: kind
-  values:
-    - db_size
-    - deadlocks
-    # Rows
-    - inserted
-    - updated
-    - deleted
-  # We export stats for 10 non-system database. Without this limit
-  # it is too easy to abuse the system by creating lots of databases.
-  query: |
-    select pg_database_size(datname) as db_size, deadlocks,
-       tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
-       datname
-     from pg_stat_database
-     where datname IN (
-       select datname
-       from pg_database
-       where datname <> 'postgres' and not datistemplate
-       order by oid
-       limit 10
-     );
-
- metric_name: max_cluster_size
-  type: gauge
-  help: 'neon.max_cluster_size setting'
-  key_labels:
-  values: [max_cluster_size]
-  query: |
-    select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
-
- metric_name: db_total_size
-  type: gauge
-  help: 'Size of all databases'
-  key_labels:
-  values: [total]
-  query: |
-    select sum(pg_database_size(datname)) as total from pg_database;
-
- metric_name: getpage_wait_seconds_count
-  type: counter
-  help: 'Number of getpage requests'
-  values: [getpage_wait_seconds_count]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_wait_seconds_sum
-  type: counter
-  help: 'Time spent in getpage requests'
-  values: [getpage_wait_seconds_sum]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_requests_total
-  type: counter
-  help: 'Number of getpage issued for prefetching'
-  values: [getpage_prefetch_requests_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_sync_requests_total
-  type: counter
-  help: 'Number of synchronous getpage issued'
-  values: [getpage_sync_requests_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_misses_total
-  type: counter
-  help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
-  values: [getpage_prefetch_misses_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_prefetch_discards_total
-  type: counter
-  help: 'Number of prefetch responses issued but not used'
-  values: [getpage_prefetch_discards_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_requests_sent_total
-  type: counter
-  help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
-  values: [pageserver_requests_sent_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_disconnects_total
-  type: counter
-  help: 'Number of times that the connection to the pageserver was lost'
-  values: [pageserver_disconnects_total]
-  query_ref: neon_perf_counters
-
- metric_name: pageserver_send_flushes_total
-  type: counter
-  help: 'Number of flushes to the pageserver connection'
-  values: [pageserver_send_flushes_total]
-  query_ref: neon_perf_counters
-
- metric_name: getpage_wait_seconds_bucket
-  type: counter
-  help: 'Histogram buckets of getpage request latency'
-  key_labels:
-      - bucket_le
-  values: [value]
-  query_ref: getpage_wait_seconds_buckets
-
-# DEPRECATED
- metric_name: lfc_approximate_working_set_size
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels:
-  values: [approximate_working_set_size]
-  query: |
-    select neon.approximate_working_set_size(false) as approximate_working_set_size;
-
- metric_name: lfc_approximate_working_set_size_windows
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels: [duration]
-  values: [size]
-  # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
-  # of durations in a pretty-printed form.
-  query: |
-    select
-      x as duration,
-      neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
-    from
-      (values ('5m'),('15m'),('1h')) as t (x);
-
- metric_name: compute_current_lsn
-  type: gauge
-  help: 'Current LSN of the database'
-  key_labels:
-  values: [lsn]
-  query: |
-    select
-      case
-        when pg_catalog.pg_is_in_recovery()
-        then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
-        else (pg_current_wal_lsn() - '0/0')::FLOAT8
-      end as lsn;
-
- metric_name: compute_receive_lsn
-  type: gauge
-  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
-  key_labels:
-  values: [lsn]
-  query: |
-    SELECT
-      CASE
-        WHEN pg_catalog.pg_is_in_recovery()
-        THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
-        ELSE 0
-      END AS lsn;
-
- metric_name: replication_delay_bytes
-  type: gauge
-  help: 'Bytes between received and replayed LSN'
-  key_labels:
-  values: [replication_delay_bytes]
-  # We use a GREATEST call here because this calculation can be negative.
-  # The calculation is not atomic, meaning after we've gotten the receive
-  # LSN, the replay LSN may have advanced past the receive LSN we
-  # are using for the calculation.
-  query: |
-    SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
-
- metric_name: replication_delay_seconds
-  type: gauge
-  help: 'Time since last LSN was replayed'
-  key_labels:
-  values: [replication_delay_seconds]
-  query: |
-    SELECT
-      CASE
-        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
-        ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
-     END AS replication_delay_seconds;
-
- metric_name: checkpoints_req
-  type: gauge
-  help: 'Number of requested checkpoints'
-  key_labels:
-  values: [checkpoints_req]
-  query: |
-    SELECT checkpoints_req FROM pg_stat_bgwriter;
-
- metric_name: checkpoints_timed
-  type: gauge
-  help: 'Number of scheduled checkpoints'
-  key_labels:
-  values: [checkpoints_timed]
-  query: |
-    SELECT checkpoints_timed FROM pg_stat_bgwriter;
-
- metric_name: compute_logical_snapshot_files
-  type: gauge
-  help: 'Number of snapshot files in pg_logical/snapshot'
-  key_labels:
-    - timeline_id
-  values: [num_logical_snapshot_files]
-  query: |
-    SELECT
-      (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-      -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-      -- temporary snapshot files are renamed to the actual snapshot files after they are
-      -- completely built. We only WAL-log the completely built snapshot files.
-      (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
-
-# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
-# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
-
-# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
- metric_name: logical_slot_restart_lsn
-  type: gauge
-  help: 'restart_lsn of logical slots'
-  key_labels:
-    - slot_name
-  values: [restart_lsn]
-  query: |
-    select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
-    from pg_replication_slots
-    where slot_type = 'logical';
-
- metric_name: compute_subscriptions_count
-  type: gauge
-  help: 'Number of logical replication subscriptions grouped by enabled/disabled'
-  key_labels:
-    - enabled
-  values: [subscriptions_count]
-  query: |
-    select subenabled::text as enabled, count(*) as subscriptions_count
-    from pg_subscription
-    group by subenabled;
-
- metric_name: retained_wal
-  type: gauge
-  help: 'Retained WAL in inactive replication slots'
-  key_labels:
-    - slot_name
-  values: [retained_wal]
-  query: |
-    SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
-    FROM pg_replication_slots
-    WHERE active = false;
-
- metric_name: wal_is_lost
-  type: gauge
-  help: 'Whether or not the replication slot wal_status is lost'
-  key_labels:
-    - slot_name
-  values: [wal_is_lost]
-  query: |
-    SELECT slot_name,
-           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
-    FROM pg_replication_slots;
-
-queries:
-  - query_name: neon_perf_counters
-    query: |
-      WITH c AS (
-        SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
-      )
-      SELECT d.*
-      FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
-          getpage_wait_seconds_count numeric,
-          getpage_wait_seconds_sum numeric,
-          getpage_prefetch_requests_total numeric,
-          getpage_sync_requests_total numeric,
-          getpage_prefetch_misses_total numeric,
-          getpage_prefetch_discards_total numeric,
-          pageserver_requests_sent_total numeric,
-          pageserver_disconnects_total numeric,
-          pageserver_send_flushes_total numeric
-      );
-
-  - query_name: getpage_wait_seconds_buckets
-    query: |
-      SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
--- a/compute/etc/neon_collector_autoscaling.jsonnet
+++ b/compute/etc/neon_collector_autoscaling.jsonnet
@@ -0,0 +1,11 @@
+{
+  collector_name: 'neon_collector_autoscaling',
+  metrics: [
+    import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet',
+    import 'sql_exporter/lfc_cache_size_limit.libsonnet',
+    import 'sql_exporter/lfc_hits.libsonnet',
+    import 'sql_exporter/lfc_misses.libsonnet',
+    import 'sql_exporter/lfc_used.libsonnet',
+    import 'sql_exporter/lfc_writes.libsonnet',
+  ],
+}
--- a/compute/etc/neon_collector_autoscaling.yml
+++ b/compute/etc/neon_collector_autoscaling.yml
@@ -1,55 +0,0 @@
-collector_name: neon_collector_autoscaling
-metrics:
- metric_name: lfc_misses
-  type: gauge
-  help: 'lfc_misses'
-  key_labels:
-  values: [lfc_misses]
-  query: |
-    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
- metric_name: lfc_used
-  type: gauge
-  help: 'LFC chunks used (chunk = 1MB)'
-  key_labels:
-  values: [lfc_used]
-  query: |
-    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
- metric_name: lfc_hits
-  type: gauge
-  help: 'lfc_hits'
-  key_labels:
-  values: [lfc_hits]
-  query: |
-    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
- metric_name: lfc_writes
-  type: gauge
-  help: 'lfc_writes'
-  key_labels:
-  values: [lfc_writes]
-  query: |
-    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
- metric_name: lfc_cache_size_limit
-  type: gauge
-  help: 'LFC cache size limit in bytes'
-  key_labels:
-  values: [lfc_cache_size_limit]
-  query: |
-    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
- metric_name: lfc_approximate_working_set_size_windows
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels: [duration_seconds]
-  values: [size]
-  # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
-  # size looking back 1..60 minutes, labeled with the number of minutes.
-  query: |
-    select
-      x::text as duration_seconds,
-      neon.approximate_working_set_size_seconds(x) as size
-    from
-      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
--- a/compute/etc/sql_exporter.jsonnet
+++ b/compute/etc/sql_exporter.jsonnet
@@ -0,0 +1,57 @@
+// Generates a sql_exporter configuration file.
+//
+// Top-level arguments:
+//   collector_file:   file name of the collector definition to load, e.g.
+//                     'neon_collector.yml'.
+//   application_name: application_name set on the Postgres connection.
+//   collector_name:   name of the collector to execute on the target. When
+//                     omitted, it is derived from collector_file by dropping
+//                     its '.yml' suffix (e.g. 'neon_collector.yml' -> 'neon_collector').
+function(collector_file, application_name='sql_exporter', collector_name=null) {
+  local yml_suffix = '.yml',
+  local target_collector =
+    if collector_name != null then
+      collector_name
+    else if std.endsWith(collector_file, yml_suffix) then
+      std.substr(collector_file, 0, std.length(collector_file) - std.length(yml_suffix))
+    else
+      collector_file,
+
+  // Global defaults.
+  global: {
+    // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+    scrape_timeout: '10s',
+    // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+    scrape_timeout_offset: '500ms',
+    // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+    min_interval: '0s',
+    // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+    // as will concurrent scrapes.
+    max_connections: 1,
+    // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+    // always be the same as max_connections.
+    max_idle_connections: 1,
+    // Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+    // If 0, connections are not closed due to a connection's age.
+    max_connection_lifetime: '5m',
+  },
+
+  // The target to monitor and the collectors to execute on it.
+  target: {
+    // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+    // the schema gets dropped or replaced to match the driver expected DSN format.
+    data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]),
+
+    // Collectors (referenced by name) to execute on the target.
+    // Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+    collectors: [
+      target_collector,
+    ],
+  },
+
+  // Collector files specifies a list of globs. One collector definition is read from each matching file.
+  // Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+  collector_files: [
+    collector_file,
+  ],
+}
--- a/compute/etc/sql_exporter.yml
+++ b/compute/etc/sql_exporter.yml
@@ -1,33 +0,0 @@
-# Configuration for sql_exporter
-# Global defaults.
-global:
-  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-  scrape_timeout: 10s
-  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-  scrape_timeout_offset: 500ms
-  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-  min_interval: 0s
-  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-  # as will concurrent scrapes.
-  max_connections: 1
-  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-  # always be the same as max_connections.
-  max_idle_connections: 1
-  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-  # If 0, connections are not closed due to a connection's age.
-  max_connection_lifetime: 5m
-
-# The target to monitor and the collectors to execute on it.
-target:
-  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-  # the schema gets dropped or replaced to match the driver expected DSN format.
-  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
-
-  # Collectors (referenced by name) to execute on the target.
-  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-  collectors: [neon_collector]
-
-# Collector files specifies a list of globs. One collector definition is read from each matching file.
-# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-collector_files:
-  - "neon_collector.yml"
--- a/compute/etc/sql_exporter/checkpoints_req.libsonnet
+++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'checkpoints_req',
+  type: 'gauge',
+  help: 'Number of requested checkpoints',
+  key_labels: null,
+  values: [
+    'checkpoints_req',
+  ],
+  query: importstr 'sql_exporter/checkpoints_req.sql',
+}
--- a/compute/etc/sql_exporter/checkpoints_req.sql
+++ b/compute/etc/sql_exporter/checkpoints_req.sql
@@ -0,0 +1 @@
+SELECT checkpoints_req FROM pg_stat_bgwriter;
--- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet
+++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'checkpoints_timed',
+  type: 'gauge',
+  help: 'Number of scheduled checkpoints',
+  key_labels: null,
+  values: [
+    'checkpoints_timed',
+  ],
+  query: importstr 'sql_exporter/checkpoints_timed.sql',
+}
--- a/compute/etc/sql_exporter/checkpoints_timed.sql
+++ b/compute/etc/sql_exporter/checkpoints_timed.sql
@@ -0,0 +1 @@
+SELECT checkpoints_timed FROM pg_stat_bgwriter;
--- a/compute/etc/sql_exporter/compute_current_lsn.libsonnet
+++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'compute_current_lsn',
+  type: 'gauge',
+  help: 'Current LSN of the database',
+  key_labels: null,
+  values: [
+    'lsn',
+  ],
+  query: importstr 'sql_exporter/compute_current_lsn.sql',
+}
--- a/compute/etc/sql_exporter/compute_current_lsn.sql
+++ b/compute/etc/sql_exporter/compute_current_lsn.sql
@@ -0,0 +1,4 @@
+SELECT CASE
+  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+  ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
+END AS lsn;
--- a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet
+++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'compute_logical_snapshot_files',
+  type: 'gauge',
+  help: 'Number of snapshot files in pg_logical/snapshots',
+  key_labels: [
+    'timeline_id',
+  ],
+  values: [
+    'num_logical_snapshot_files',
+  ],
+  query: importstr 'sql_exporter/compute_logical_snapshot_files.sql',
+}
--- a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql
+++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql
@@ -0,0 +1,7 @@
+SELECT
+  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files
+  (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
--- a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet
+++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'compute_receive_lsn',
+  type: 'gauge',
+  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication',
+  key_labels: null,
+  values: [
+    'lsn',
+  ],
+  query: importstr 'sql_exporter/compute_receive_lsn.sql',
+}
--- a/compute/etc/sql_exporter/compute_receive_lsn.sql
+++ b/compute/etc/sql_exporter/compute_receive_lsn.sql
@@ -0,0 +1,4 @@
+SELECT CASE
+  WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+  ELSE 0
+END AS lsn;
--- a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet
+++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'compute_subscriptions_count',
+  type: 'gauge',
+  help: 'Number of logical replication subscriptions grouped by enabled/disabled',
+  key_labels: [
+    'enabled',
+  ],
+  values: [
+    'subscriptions_count',
+  ],
+  query: importstr 'sql_exporter/compute_subscriptions_count.sql',
+}
--- a/compute/etc/sql_exporter/compute_subscriptions_count.sql
+++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql
@@ -0,0 +1 @@
+SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;
--- a/compute/etc/sql_exporter/connection_counts.libsonnet
+++ b/compute/etc/sql_exporter/connection_counts.libsonnet
@@ -0,0 +1,13 @@
+{
+  metric_name: 'connection_counts',
+  type: 'gauge',
+  help: 'Connection counts',
+  key_labels: [
+    'datname',
+    'state',
+  ],
+  values: [
+    'count',
+  ],
+  query: importstr 'sql_exporter/connection_counts.sql',
+}
--- a/compute/etc/sql_exporter/connection_counts.sql
+++ b/compute/etc/sql_exporter/connection_counts.sql
@@ -0,0 +1 @@
+SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;
--- a/compute/etc/sql_exporter/db_total_size.libsonnet
+++ b/compute/etc/sql_exporter/db_total_size.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'db_total_size',
+  type: 'gauge',
+  help: 'Size of all databases',
+  key_labels: null,
+  values: [
+    'total',
+  ],
+  query: importstr 'sql_exporter/db_total_size.sql',
+}
--- a/compute/etc/sql_exporter/db_total_size.sql
+++ b/compute/etc/sql_exporter/db_total_size.sql
@@ -0,0 +1 @@
+SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
--- a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'file_cache_read_wait_seconds_bucket',
+  type: 'counter',
+  help: 'Histogram buckets of LFC read operation latencies',
+  key_labels: [
+    'bucket_le',
+  ],
+  values: [
+    'value',
+  ],
+  query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql',
+}
--- a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql
+++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql
@@ -0,0 +1 @@
+SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket';
--- a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'file_cache_read_wait_seconds_count',
+  type: 'counter',
+  help: 'Number of read operations in LFC',
+  values: [
+    'file_cache_read_wait_seconds_count',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'file_cache_read_wait_seconds_sum',
+  type: 'counter',
+  help: 'Time spent in LFC read operations',
+  values: [
+    'file_cache_read_wait_seconds_sum',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'file_cache_write_wait_seconds_bucket',
+  type: 'counter',
+  help: 'Histogram buckets of LFC write operation latencies',
+  key_labels: [
+    'bucket_le',
+  ],
+  values: [
+    'value',
+  ],
+  query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql',
+}
--- a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql
+++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql
@@ -0,0 +1 @@
+SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket';
--- a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'file_cache_write_wait_seconds_count',
+  type: 'counter',
+  help: 'Number of write operations in LFC',
+  values: [
+    'file_cache_write_wait_seconds_count',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet
+++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'file_cache_write_wait_seconds_sum',
+  type: 'counter',
+  help: 'Time spent in LFC write operations',
+  values: [
+    'file_cache_write_wait_seconds_sum',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet
+++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_prefetch_discards_total',
+  type: 'counter',
+  help: 'Number of prefetch responses issued but not used',
+  values: [
+    'getpage_prefetch_discards_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet
+++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_prefetch_misses_total',
+  type: 'counter',
+  help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read",
+  values: [
+    'getpage_prefetch_misses_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet
+++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_prefetch_requests_total',
+  type: 'counter',
+  help: 'Number of getpage issued for prefetching',
+  values: [
+    'getpage_prefetch_requests_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet
+++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_prefetches_buffered',
+  type: 'gauge',
+  help: 'Number of prefetched pages buffered in neon',
+  values: [
+    'getpage_prefetches_buffered',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet
+++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_sync_requests_total',
+  type: 'counter',
+  help: 'Number of synchronous getpage issued',
+  values: [
+    'getpage_sync_requests_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet
+++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'getpage_wait_seconds_bucket',
+  type: 'counter',
+  help: 'Histogram buckets of getpage request latency',
+  key_labels: [
+    'bucket_le',
+  ],
+  values: [
+    'value',
+  ],
+  query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql',
+}
--- a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql
+++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql
@@ -0,0 +1 @@
+SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
--- a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet
+++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_wait_seconds_count',
+  type: 'counter',
+  help: 'Number of getpage requests',
+  values: [
+    'getpage_wait_seconds_count',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet
+++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'getpage_wait_seconds_sum',
+  type: 'counter',
+  help: 'Time spent in getpage requests',
+  values: [
+    'getpage_wait_seconds_sum',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet
@@ -0,0 +1,12 @@
+// DEPRECATED
+
+{
+  metric_name: 'lfc_approximate_working_set_size',
+  type: 'gauge',
+  help: 'Approximate working set size in pages of 8192 bytes',
+  key_labels: null,
+  values: [
+    'approximate_working_set_size',
+  ],
+  query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql',
+}
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql
@@ -0,0 +1 @@
+SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size;
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'lfc_approximate_working_set_size_windows',
+  type: 'gauge',
+  help: 'Approximate working set size in pages of 8192 bytes',
+  key_labels: [
+    'duration_seconds',
+  ],
+  values: [
+    'size',
+  ],
+  query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql',
+}
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql
@@ -0,0 +1,8 @@
+-- NOTE: This is the "internal" / "machine-readable" version. This outputs the
+-- working set size looking back 1..60 minutes, labeled with the window length
+-- in seconds (60, 120, ..., 3600).
+
+SELECT
+  x::text as duration_seconds,
+  neon.approximate_working_set_size_seconds(x) AS size
+FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'lfc_approximate_working_set_size_windows',
+  type: 'gauge',
+  help: 'Approximate working set size in pages of 8192 bytes',
+  key_labels: [
+    'duration',
+  ],
+  values: [
+    'size',
+  ],
+  query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql',
+}
--- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql
+++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql
@@ -0,0 +1,8 @@
+-- NOTE: This is the "public" / "human-readable" version. Here, we supply a
+-- small selection of durations in a pretty-printed form.
+
+SELECT
+  x AS duration,
+  neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
+    VALUES ('5m'), ('15m'), ('1h')
+  ) AS t (x);
--- a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet
+++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_cache_size_limit',
+  type: 'gauge',
+  help: 'LFC cache size limit in bytes',
+  key_labels: null,
+  values: [
+    'lfc_cache_size_limit',
+  ],
+  query: importstr 'sql_exporter/lfc_cache_size_limit.sql',
+}
--- a/compute/etc/sql_exporter/lfc_cache_size_limit.sql
+++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql
@@ -0,0 +1 @@
+SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
--- a/compute/etc/sql_exporter/lfc_hits.libsonnet
+++ b/compute/etc/sql_exporter/lfc_hits.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_hits',
+  type: 'gauge',
+  help: 'lfc_hits',
+  key_labels: null,
+  values: [
+    'lfc_hits',
+  ],
+  query: importstr 'sql_exporter/lfc_hits.sql',
+}
--- a/compute/etc/sql_exporter/lfc_hits.sql
+++ b/compute/etc/sql_exporter/lfc_hits.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits';
--- a/compute/etc/sql_exporter/lfc_misses.libsonnet
+++ b/compute/etc/sql_exporter/lfc_misses.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_misses',
+  type: 'gauge',
+  help: 'lfc_misses',
+  key_labels: null,
+  values: [
+    'lfc_misses',
+  ],
+  query: importstr 'sql_exporter/lfc_misses.sql',
+}
--- a/compute/etc/sql_exporter/lfc_misses.sql
+++ b/compute/etc/sql_exporter/lfc_misses.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses';
--- a/compute/etc/sql_exporter/lfc_used.libsonnet
+++ b/compute/etc/sql_exporter/lfc_used.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_used',
+  type: 'gauge',
+  help: 'LFC chunks used (chunk = 1MB)',
+  key_labels: null,
+  values: [
+    'lfc_used',
+  ],
+  query: importstr 'sql_exporter/lfc_used.sql',
+}
--- a/compute/etc/sql_exporter/lfc_used.sql
+++ b/compute/etc/sql_exporter/lfc_used.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used';
--- a/compute/etc/sql_exporter/lfc_writes.libsonnet
+++ b/compute/etc/sql_exporter/lfc_writes.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_writes',
+  type: 'gauge',
+  help: 'lfc_writes',
+  key_labels: null,
+  values: [
+    'lfc_writes',
+  ],
+  query: importstr 'sql_exporter/lfc_writes.sql',
+}
--- a/compute/etc/sql_exporter/lfc_writes.sql
+++ b/compute/etc/sql_exporter/lfc_writes.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes';
--- a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet
+++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet
@@ -0,0 +1,15 @@
+// Number of slots is limited by max_replication_slots, so collecting position
+// for all of them shouldn't be bad.
+
+{
+  metric_name: 'logical_slot_restart_lsn',
+  type: 'gauge',
+  help: 'restart_lsn of logical slots',
+  key_labels: [
+    'slot_name',
+  ],
+  values: [
+    'restart_lsn',
+  ],
+  query: importstr 'sql_exporter/logical_slot_restart_lsn.sql',
+}
--- a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql
+++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql
@@ -0,0 +1,3 @@
+SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
+FROM pg_replication_slots
+WHERE slot_type = 'logical';
--- a/compute/etc/sql_exporter/max_cluster_size.libsonnet
+++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'max_cluster_size',
+  type: 'gauge',
+  help: 'neon.max_cluster_size setting',
+  key_labels: null,
+  values: [
+    'max_cluster_size',
+  ],
+  query: importstr 'sql_exporter/max_cluster_size.sql',
+}
--- a/compute/etc/sql_exporter/max_cluster_size.sql
+++ b/compute/etc/sql_exporter/max_cluster_size.sql
@@ -0,0 +1 @@
+SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';
--- a/compute/etc/sql_exporter/neon_perf_counters.sql
+++ b/compute/etc/sql_exporter/neon_perf_counters.sql
@@ -0,0 +1,19 @@
+WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters)
+
+SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
+  file_cache_read_wait_seconds_count numeric,
+  file_cache_read_wait_seconds_sum numeric,
+  file_cache_write_wait_seconds_count numeric,
+  file_cache_write_wait_seconds_sum numeric,
+  getpage_wait_seconds_count numeric,
+  getpage_wait_seconds_sum numeric,
+  getpage_prefetch_requests_total numeric,
+  getpage_sync_requests_total numeric,
+  getpage_prefetch_misses_total numeric,
+  getpage_prefetch_discards_total numeric,
+  getpage_prefetches_buffered numeric,
+  pageserver_requests_sent_total numeric,
+  pageserver_disconnects_total numeric,
+  pageserver_send_flushes_total numeric,
+  pageserver_open_requests numeric
+);
--- a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet
+++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'pageserver_disconnects_total',
+  type: 'counter',
+  help: 'Number of times that the connection to the pageserver was lost',
+  values: [
+    'pageserver_disconnects_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet
+++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'pageserver_open_requests',
+  type: 'gauge',
+  help: 'Number of open requests to PageServer',
+  values: [
+    'pageserver_open_requests',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet
+++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'pageserver_requests_sent_total',
+  type: 'counter',
+  help: 'Number of all requests sent to the pageserver (not just GetPage requests)',
+  values: [
+    'pageserver_requests_sent_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet
+++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'pageserver_send_flushes_total',
+  type: 'counter',
+  help: 'Number of flushes to the pageserver connection',
+  values: [
+    'pageserver_send_flushes_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet
+++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet
@@ -0,0 +1,18 @@
+{
+  metric_name: 'pg_stats_userdb',
+  type: 'gauge',
+  help: 'Stats for several oldest non-system dbs',
+  key_labels: [
+    'datname',
+  ],
+  value_label: 'kind',
+  values: [
+    'db_size',
+    'deadlocks',
+    // Rows
+    'inserted',
+    'updated',
+    'deleted',
+  ],
+  query: importstr 'sql_exporter/pg_stats_userdb.sql',
+}
--- a/compute/etc/sql_exporter/pg_stats_userdb.sql
+++ b/compute/etc/sql_exporter/pg_stats_userdb.sql
@@ -0,0 +1,10 @@
+-- We export stats for 10 non-system databases. Without this limit it is too
+-- easy to abuse the system by creating lots of databases.
+
+SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
+  tup_updated AS updated, tup_deleted AS deleted, datname
+FROM pg_stat_database
+WHERE datname IN (
+  SELECT datname FROM pg_database
+  WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
+);
--- a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet
+++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'replication_delay_bytes',
+  type: 'gauge',
+  help: 'Bytes between received and replayed LSN',
+  key_labels: null,
+  values: [
+    'replication_delay_bytes',
+  ],
+  query: importstr 'sql_exporter/replication_delay_bytes.sql',
+}
--- a/compute/etc/sql_exporter/replication_delay_bytes.sql
+++ b/compute/etc/sql_exporter/replication_delay_bytes.sql
@@ -0,0 +1,6 @@
+-- We use a GREATEST call here because this calculation can be negative. The
+-- calculation is not atomic, meaning after we've gotten the receive LSN, the
+-- replay LSN may have advanced past the receive LSN we are using for the
+-- calculation.
+
+SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
--- a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet
+++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'replication_delay_seconds',
+  type: 'gauge',
+  help: 'Time since last LSN was replayed',
+  key_labels: null,
+  values: [
+    'replication_delay_seconds',
+  ],
+  query: importstr 'sql_exporter/replication_delay_seconds.sql',
+}
--- a/compute/etc/sql_exporter/replication_delay_seconds.sql
+++ b/compute/etc/sql_exporter/replication_delay_seconds.sql
@@ -0,0 +1,5 @@
+SELECT
+  CASE
+    WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
+    ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
+  END AS replication_delay_seconds;
--- a/compute/etc/sql_exporter/retained_wal.libsonnet
+++ b/compute/etc/sql_exporter/retained_wal.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'retained_wal',
+  type: 'gauge',
+  help: 'Retained WAL in inactive replication slots',
+  key_labels: [
+    'slot_name',
+  ],
+  values: [
+    'retained_wal',
+  ],
+  query: importstr 'sql_exporter/retained_wal.sql',
+}
--- a/compute/etc/sql_exporter/retained_wal.sql
+++ b/compute/etc/sql_exporter/retained_wal.sql
@@ -0,0 +1,15 @@
+-- WAL retained by each inactive replication slot, in bytes.
+-- pg_current_wal_lsn() raises an error while the server is in recovery, so on
+-- a standby the distance is measured from the last replayed LSN instead.
+
+SELECT
+  slot_name,
+  pg_wal_lsn_diff(
+    CASE
+      WHEN pg_catalog.pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
+      ELSE pg_current_wal_lsn()
+    END,
+    restart_lsn
+  )::FLOAT8 AS retained_wal
+FROM pg_replication_slots
+WHERE active = false;
--- a/compute/etc/sql_exporter/wal_is_lost.libsonnet
+++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet
@@ -0,0 +1,12 @@
+{
+  metric_name: 'wal_is_lost',
+  type: 'gauge',
+  help: 'Whether or not the replication slot wal_status is lost',
+  key_labels: [
+    'slot_name',
+  ],
+  values: [
+    'wal_is_lost',
+  ],
+  query: importstr 'sql_exporter/wal_is_lost.sql',
+}
--- a/compute/etc/sql_exporter/wal_is_lost.sql
+++ b/compute/etc/sql_exporter/wal_is_lost.sql
@@ -0,0 +1,7 @@
+SELECT
+  slot_name,
+  CASE
+    WHEN wal_status = 'lost' THEN 1
+    ELSE 0
+  END AS wal_is_lost
+FROM pg_replication_slots;
--- a/compute/etc/sql_exporter_autoscaling.yml
+++ b/compute/etc/sql_exporter_autoscaling.yml
@@ -1,33 +0,0 @@
-# Configuration for sql_exporter for autoscaling-agent
-# Global defaults.
-global:
-  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-  scrape_timeout: 10s
-  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-  scrape_timeout_offset: 500ms
-  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-  min_interval: 0s
-  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-  # as will concurrent scrapes.
-  max_connections: 1
-  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-  # always be the same as max_connections.
-  max_idle_connections: 1
-  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-  # If 0, connections are not closed due to a connection's age.
-  max_connection_lifetime: 5m
-
-# The target to monitor and the collectors to execute on it.
-target:
-  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-  # the schema gets dropped or replaced to match the driver expected DSN format.
-  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
-
-  # Collectors (referenced by name) to execute on the target.
-  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-  collectors: [neon_collector_autoscaling]
-
-# Collector files specifies a list of globs. One collector definition is read from each matching file.
-# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-collector_files:
-  - "neon_collector_autoscaling.yml"
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -0,0 +1,126 @@
+# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
+---
+commands:
+  - name: cgconfigparser
+    user: root
+    sysvInitAction: sysinit
+    shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
+  # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
+  # running it as root.
+  - name: chmod-resize-swap
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/resize-swap'
+  - name: chmod-set-disk-quota
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
+  - name: pgbouncer
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+  - name: local_proxy
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+  - name: postgres-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
+  - name: sql-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
+  - name: sql-exporter-autoscaling
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+shutdownHook: |
+  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
+files:
+  - filename: compute_ctl-sudoers
+    content: |
+      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
+      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
+      # regardless of hostname (ALL)
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
+  - filename: cgconfig.conf
+    content: |
+      # Configuration for cgroups in VM compute nodes
+      group neon-postgres {
+          perm {
+              admin {
+                  uid = postgres;
+              }
+              task {
+                  gid = users;
+              }
+          }
+          memory {}
+      }
+build: |
+  # Build cgroup-tools
+  #
+  # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
+  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
+  # requires cgroup v2, so we'll build cgroup-tools ourselves.
+  #
+  # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
+  # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
+  # for debian version migration.
+  #
+  FROM debian:bookworm-slim as libcgroup-builder
+  ENV LIBCGROUP_VERSION=v2.0.3
+
+  RUN set -exu \
+      && apt update \
+      && apt install --no-install-recommends -y \
+          git \
+          ca-certificates \
+          automake \
+          cmake \
+          make \
+          gcc \
+          byacc \
+          flex \
+          libtool \
+          libpam0g-dev \
+      && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
+      && INSTALL_DIR="/libcgroup-install" \
+      && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
+      && cd libcgroup \
+      # extracted from bootstrap.sh, with modified flags:
+      && (test -d m4 || mkdir m4) \
+      && autoreconf -fi \
+      && rm -rf autom4te.cache \
+      && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
+      # actually build the thing...
+      && make install
+merge: |
+  # tweak nofile limits
+  RUN set -e \
+      && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
+      && test ! -e /etc/security || ( \
+         echo '*    - nofile 1048576' >>/etc/security/limits.conf \
+      && echo 'root - nofile 1048576' >>/etc/security/limits.conf \
+         )
+
+  # Allow postgres user (compute_ctl) to run the swap resizer and the disk quota setter as root.
+  # Need to install sudo in order to allow this.
+  #
+  # Read permission for group/other on those binaries is removed by the chmod-* sysinit commands (mode 711).
+  RUN set -e \
+      && apt update \
+      && apt install --no-install-recommends -y \
+             sudo \
+      && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+  COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
+
+  COPY cgconfig.conf /etc/cgconfig.conf
+
+  RUN set -e \
+      && chmod 0644 /etc/cgconfig.conf
+
+  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
+  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
+  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -97,7 +97,21 @@ impl ComputeControlPlane {
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
+            let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env);
+            let ep = match ep_res {
+                Ok(ep) => ep,
+                Err(e) => match e.downcast::<std::io::Error>() {
+                    Ok(e) => {
+                        // A parallel task could have deleted this endpoint after we scanned the directory
+                        if e.kind() == std::io::ErrorKind::NotFound {
+                            continue;
+                        } else {
+                            Err(e)?
+                        }
+                    }
+                    Err(e) => Err(e)?,
+                },
+            };
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -28,6 +28,9 @@ pub enum ApiError {
    #[error("Resource temporarily unavailable: {0}")]
    ResourceUnavailable(Cow<'static, str>),

+    #[error("Too many requests: {0}")]
+    TooManyRequests(Cow<'static, str>),
+
    #[error("Shutting down")]
    ShuttingDown,

@@ -73,6 +76,10 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
+            ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::TOO_MANY_REQUESTS,
+            ),
            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::REQUEST_TIMEOUT,
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -16,7 +16,7 @@ use fail::fail_point;
 use pageserver_api::key::Key;
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
-use std::time::SystemTime;
+use std::time::{Instant, SystemTime};
 use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;
@@ -352,12 +352,25 @@ where
            }
        }

-        for (path, content) in self
+        let start_time = Instant::now();
+        let aux_files = self
            .timeline
            .list_aux_files(self.lsn, self.ctx)
            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
-        {
+            .map_err(|e| BasebackupError::Server(e.into()))?;
+        let aux_scan_time = start_time.elapsed();
+        let aux_estimated_size = aux_files
+            .values()
+            .map(|content| content.len())
+            .sum::<usize>();
+        info!(
+            "Scanned {} aux files in {}ms, aux file content size = {}",
+            aux_files.len(),
+            aux_scan_time.as_millis(),
+            aux_estimated_size
+        );
+
+        for (path, content) in aux_files {
            if path.starts_with("pg_replslot") {
                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                let restart_lsn = Lsn(u64::from_le_bytes(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
@@ -325,6 +326,7 @@ impl From<crate::tenant::TimelineArchivalError> for ApiError {
        match value {
            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
            Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
+            Cancelled => ApiError::ShuttingDown,
            e @ HasArchivedParent(_) => {
                ApiError::PreconditionFailed(e.to_string().into_boxed_str())
            }
@@ -715,6 +717,8 @@ async fn timeline_archival_config_handler(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
        tenant
            .apply_timeline_archival_config(timeline_id, request_data.state, ctx)
            .await?;
@@ -1783,6 +1787,49 @@ async fn timeline_compact_handler(
    .await
 }

+// Run offload immediately on the given timeline; responds 200 OK right away if the timeline is already offloaded.
+async fn timeline_offload_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        if tenant.get_offloaded_timeline(timeline_id).is_ok() { // already offloaded: nothing to do, report success
+            return json_response(StatusCode::OK, ());
+        }
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if !tenant.timeline_has_no_attached_children(timeline_id) { // refuse while attached children exist
+            return Err(ApiError::PreconditionFailed(
+                "timeline has attached children".into(),
+            ));
+        }
+        if !timeline.can_offload() { // the timeline itself reports that it cannot be offloaded
+            return Err(ApiError::PreconditionFailed(
+                "Timeline::can_offload() returned false".into(),
+            ));
+        }
+        offload_timeline(&tenant, &timeline)
+            .await
+            .map_err(ApiError::InternalServerError)?; // offload failures are reported as internal server errors
+
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await
+}
+
 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
@@ -3006,6 +3053,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
            |r| api_handler(r, timeline_compact_handler),
        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
+            |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
+        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -26,8 +26,8 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::SystemTime;
 use std::time::{Duration, Instant};
-use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -1137,10 +1137,10 @@ impl PageServerHandler {
            .await
            .map_err(map_basebackup_error)?;
        } else {
-            let mut writer = pgb.copyout_writer();
+            let mut writer = BufWriter::new(pgb.copyout_writer());
            if gzip {
                let mut encoder = GzipEncoder::with_quality(
-                    writer,
+                    &mut writer,
                    // NOTE using fast compression because it's on the critical path
                    //      for compute startup. For an empty database, we get
                    //      <100KB with this method. The Level::Best compression method
@@ -1175,6 +1175,10 @@ impl PageServerHandler {
                .await
                .map_err(map_basebackup_error)?;
            }
+            writer
+                .flush()
+                .await
+                .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> {
            // Update relation size cache
            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);

-            // Update relation size cache
-            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
-
            // Update logical database size.
            self.pending_nblocks -= old_size as i64 - nblocks as i64;
        }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Konstantin Knizhnik	3e8cb25e53	Increase range of expected value for working set approximation test	2024-10-16 18:59:19 +03:00
Tristan Partin	061ea0de7a	Add jsonnetfmt targets This should make it a little bit easier for people wanting to check if their files are formatted correctly. Has the added bonus of making the CI check simpler as well. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-10-15 20:01:13 -05:00
Tristan Partin	be5d6a69dc	Fix jsonnet_files wildcard Just a typo in a path. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-10-15 16:30:31 -05:00
Matthias van de Meent	18f4e5f10c	Add newly added metrics from neondatabase/neon#9116 to exports (#9402 ) They weren't added in that PR, but should be available immediately on rollout as the neon extension already defaults to 1.5.	2024-10-15 23:13:31 +02:00
Alex Chi Z.	f1eb703256	fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401 ) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there are too many wasted round-trip time for writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes too long time retrieving all aux files that it did not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup. * Log how many aux files did we read + the time spent on it. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-10-15 16:35:21 -04:00
Tristan Partin	cf7a596a15	Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-10-15 11:18:38 -05:00
Konstantin Knizhnik	614c3aef72	Remove redundant code (#9373 ) ## Problem There is a double update of resize cache in `put_rel_truncation` Also `page_server_request` contains check that fork is MAIN_FORKNUM which 1. is incorrect (because Vm/FSM pages are sharded in the same way as MAIN fork pages) and 2. is redundant because `page_server_request` is never called for `get page` request so the first part of the OR condition is always true. ## Summary of changes Remove redundant code ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-10-15 17:18:52 +03:00
Folke Behrens	fb74c21e8c	proxy: Migrate jwt module away from anyhow (#9361 )	2024-10-15 15:24:56 +02:00
Conrad Ludgate	d92d36a315	[local_proxy] update api for pg_session_jwt (#9359 ) pg_session_jwt now: 1. Sets the JWK in a PGU_BACKEND session guc, no longer in the init() function. 2. JWK no longer needs the kid.	2024-10-15 12:13:57 +00:00
Arpad Müller	ec4cc30de9	Shut down timelines during offload and add offload tests (#9289 ) Add a test for timeline offloading, and subsequent unoffloading. Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088.	2024-10-15 09:46:51 +00:00
John Spray	73c6626b38	pageserver: stabilize & refine controller scale test (#8971 ) ## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenant which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timelines with large shard count tenants. - Fix a pageserver bug where calls to archival_config during activation get 404	2024-10-15 09:31:18 +01:00
Alexander Bayandin	0fc4ada3ca	Switch CI, Storage and Proxy to Debian 12 (Bookworm) (#9170 ) ## Problem This PR switches CI and Storage to Debian 12 (Bookworm) based images. ## Summary of changes - Add Debian codename (`bookworm`/`bullseye`) to most of docker tags, create un-codenamed images to be used by default - `vm-compute-node-image`: create a separate spec for `bookworm` (we don't need to build cgroups in the future) - `neon-image`: Switch to `bookworm`-based `build-tools` image - Storage components and Proxy use it - CI: run lints and tests on `bookworm`-based `build-tools` image	2024-10-14 21:12:43 +01:00
Matthias van de Meent	dab96a6eb1	Add more timing histogram and gauge metrics to the Neon extension (#9116 ) We now also track: - Number of PS IOs in-flight - Number of pages cached by smgr prefetch implementation - IO timing histograms for LFC reads and writes, per IO issued ## Problem There's little insight into the timing metrics of LFC, and what the prefetch state of each backend is. This changes that, by measuring (and subsequently exposing) these data points. ## Summary of changes - Extract IOHistogram as separate type, rather than a collection of fields on NeonMetrics - others, see items above. Part of https://github.com/neondatabase/neon/issues/8926	2024-10-14 20:30:21 +02:00
Arpad Müller	f54e3e9147	Also consider offloaded timelines for obtaining retain_lsn (#9308 ) Also consider offloaded timelines for obtaining `retain_lsn`. This is required for correctness for all timelines that have not been flattened yet: otherwise we GC data that might still be required for reading. This somewhat counteracts the original purpose of timeline offloading of not having to iterate over offloaded timelines, but sadly it's required. In the future, we can improve the way the offloaded timelines are stored. We also make the `retain_lsn` optional so that in the future, when we implement flattening, we can make it None. This also applies to full timeline objects by the way, where it would probably make most sense to add a bool flag whether the timeline is successfully flattened, and if it is, one can exclude it from `retain_lsn` as well. Also, track whether a timeline was offloaded or not in `retain_lsn` so that the `retain_lsn` can be excluded from visibility and size calculation. Part of #8088	2024-10-14 17:54:03 +02:00
Vlad Lazar	f4f7ea247c	tests: make size comparisons more lenient (#9388 ) The empirically determined threshold doesn't hold for PG 17. Bump the limit to stabilise ci.	2024-10-14 16:50:12 +01:00
Arpad Müller	d92ff578c4	Add test for fixed storage broker issue (#9311 ) Adds a test for the (now fixed) storage broker limit issue, see #9268 for the description and #9299 for the fix. Also fix a race condition with endpoint creation/starts running in parallel, leading to file not found errors.	2024-10-14 14:34:57 +02:00
Alexander Bayandin	31b7703fa8	CI(build-build-tools): fix unexpected cancellations (#9357 ) ## Problem When `Dockerfile.build-tools` gets changed, several PRs catch up with it and some might get unexpectedly cancelled workflows because of GitHub's concurrency model for workflows. See the comment in the code for more details. It should be possible to revert it after https://github.com/orgs/community/discussions/41518 (I don't expect it anytime soon, but I subscribed) ## Summary of changes - Do not queue `build-build-tools-image` workflows in the concurrency group	2024-10-14 11:51:01 +01:00
Konstantin Knizhnik	d056ae9be5	Ignore pg_dynshmem file when comparing directories (#9374 ) ## Problem On macOS the `pg_dynshmem` file is created in PGDATADIR, which causes a mismatch in directory comparison ## Summary of changes Add this file to the ignore list. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-10-14 13:45:20 +03:00
Conrad Ludgate	cb9ab7463c	proxy: split out the console-redirect backend flow (#9270 ) removes the ConsoleRedirect backend from the main auth::Backends enum, copy-paste the existing crate::proxy::task_main structure to use the ConsoleRedirectBackend exclusively. This makes the logic a bit simpler at the cost of some fairly trivial code duplication.	2024-10-14 12:25:55 +02:00
				`@@ -0,0 +1 @@`
				`SELECT checkpoints_req FROM pg_stat_bgwriter;`
				`@@ -0,0 +1 @@`
				`SELECT checkpoints_timed FROM pg_stat_bgwriter;`
				`@@ -0,0 +1 @@`
				`SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;`
				`@@ -0,0 +1 @@`
				`SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;`
				`@@ -0,0 +1 @@`
				`SELECT sum(pg_database_size(datname)) AS total FROM pg_database;`
				`@@ -0,0 +1 @@`
				`SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket';`
				`@@ -0,0 +1 @@`
				`SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size;`
				`@@ -0,0 +1 @@`
				`SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;`
				`@@ -0,0 +1 @@`
				`SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits';`