Compare commits

...

37 Commits

Author SHA1 Message Date
Alexander Bayandin
d31e272919 DO NOT MERGE: test cache-from 2024-09-25 13:52:44 +01:00
Alexander Bayandin
ebea319d64 DO NOT MERGE: test cache-from 2024-09-25 12:19:58 +01:00
Vlad Lazar
a26cc29d92 storcon: add tags to scheduler logs (#9127)
We log something at info level each time we schedule a shard to a
non-secondary location.

Might as well have context for it.
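The change itself isn't shown in this view; purely as a rough illustration (assuming the `tracing` and `tracing-subscriber` crates, with made-up ID types), adding that context as structured fields looks something like:

```rust
use tracing::info;

// Illustrative stand-ins for the storage controller's real identifier types.
struct TenantShardId(String);
struct NodeId(u64);

fn log_schedule_decision(shard: &TenantShardId, node: &NodeId) {
    // Structured fields carry the context (which shard went to which node)
    // alongside the message, so the info-level line is actually useful.
    info!(shard = %shard.0, node = node.0, "scheduling shard to non-secondary location");
}

fn main() {
    tracing_subscriber::fmt::init();
    log_schedule_decision(&TenantShardId("1234/0".to_string()), &NodeId(7));
}
```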
2024-09-25 10:16:06 +01:00
Alex Chi Z.
5f2f31e879 fix(test): storage scrubber should only log to stdout with info (#9067)
As @koivunej mentioned in the storage channel, for regress tests we
don't need to create a log file for the scrubber, and we should reduce
noisy logs.

## Summary of changes

* Disable log file creation for storage scrubber
* Only log at info level

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-24 22:33:03 +00:00
Damian972
938b163b42 chore(docker-compose): fix typo in readme (#9133)
Typo in the readme inside docker-compose folder

## Summary of changes
- Update the readme
2024-09-24 18:05:23 -04:00
Heikki Linnakangas
5cbf5b45ae Remove TenantState::Loading (#9118)
The last real use was removed in commit de90bf4663. It was still used in
a few unit tests, but they can use Attaching too.
2024-09-24 20:58:54 +00:00
Heikki Linnakangas
af5c54ed14 test: Make test_lfc_resize more robust (#9117)
1. Increase statement_timeout. It defaults to 120 s, which is not quite
enough on slow or busy systems with a debug build. On my laptop, the index
creation takes about 100 s. On buildfarm, we've seen failures, e.g:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9084/10997888708/index.html#suites/821f97908a487f1d7d3a2a4dd1571e99/db1834bddfe8c5b9/

2. Keep twiddling the LFC size through the whole test. Before, we would
do it for the first 10 seconds, but that only covers a small part of the
pgbench initialization phase. Change the loop so that the pgbench run
time determines how long the test runs, and we keep changing the LFC for
the whole time.

In passing, also fix a bogus test description that was copy-pasted from
a completely unrelated test.
2024-09-24 23:38:16 +03:00
Alexander Bayandin
523cf71721 Fix compiler warnings on macOS (#9128)
## Problem

Compilation of neon extension on macOS produces a warning
```
pgxn/neon/neon_perf_counters.c:50:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
```

## Summary of changes
- Change the return type of `NeonPerfCountersShmemInit` to void
2024-09-24 18:11:31 +00:00
Arpad Müller
c47f355ec1 Catch Cancelled and don't print a warning for it (#9121)
In the `imitate_synthetic_size_calculation_worker` function, we might
obtain the `Cancelled` error variant instead of hitting the cancellation
token based path. Therefore, catch `Cancelled` and handle it analogously
to the cancellation case.
 
Fixes #8886.
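The concrete types aren't visible in this view, so the following is only a shape sketch with a hypothetical error enum: the point is that a `Cancelled` error value gets the same quiet treatment as the cancellation-token path instead of being logged as a warning.

```rust
// Hypothetical error type standing in for the real one returned by the
// synthetic-size imitation worker; only the shape matters here.
#[derive(Debug)]
enum ImitateError {
    Cancelled,
    Other(String),
}

fn handle_result(res: Result<(), ImitateError>) {
    match res {
        Ok(()) => {}
        // Treat an error-valued cancellation like the cancellation-token path:
        // expected during shutdown, so no warning is printed for it.
        Err(ImitateError::Cancelled) => {
            println!("imitation cancelled, shutting down");
        }
        Err(other) => {
            eprintln!("warning: imitation failed: {other:?}");
        }
    }
}

fn main() {
    handle_result(Err(ImitateError::Cancelled));
    handle_result(Err(ImitateError::Other("boom".into())));
}
```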
2024-09-24 17:28:56 +00:00
Yuchen Liang
4f67b0225b pageserver: handle decompression outside vectored read_blobs (#8942)
Part of #8130.

## Problem

Currently, decompression is performed within the `read_blobs`
implementation and the decompressed blob will be appended to the end of
the `BytesMut` buffer. We will lose this flexibility of extending the
buffer when we switch to using our own dio-aligned buffer (WIP in
https://github.com/neondatabase/neon/pull/8730). To facilitate the
adoption of aligned buffer, we need to refactor the code to perform
decompression outside `read_blobs`.

## Summary of changes

- `VectoredBlobReader::read_blobs` will return `VectoredBlob` without
performing decompression and appending decompressed blob. It becomes the
caller's responsibility to decompress the buffer.
- Added a new `BufView` type that functions as `Cow<Bytes, &[u8]>`.
- Perform decompression within `VectoredBlob::read` so that callers don't
have to think explicitly about compression when using the reader
interface (see the sketch below).
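As a rough sketch of the shape described above (with `Vec<u8>` standing in for `Bytes` and a placeholder "decompression" step; the real types live in the pageserver crate):

```rust
// A Cow-like view over blob bytes: either a borrowed slice of the read buffer
// or an owned buffer produced by decompression.
enum BufView<'a> {
    Borrowed(&'a [u8]),
    Owned(Vec<u8>),
}

impl<'a> BufView<'a> {
    fn as_slice(&self) -> &[u8] {
        match self {
            BufView::Borrowed(s) => s,
            BufView::Owned(v) => v,
        }
    }
}

// Stand-in for a blob descriptor returned by the vectored reader: it no longer
// decompresses eagerly, it just remembers where the (possibly compressed)
// bytes sit in the shared buffer.
struct VectoredBlobSketch {
    start: usize,
    end: usize,
    compressed: bool,
}

impl VectoredBlobSketch {
    // Decompression happens here, at read time, so the read path itself never
    // needs to grow or append to the shared buffer.
    fn read<'a>(&self, buf: &'a [u8]) -> BufView<'a> {
        let raw = &buf[self.start..self.end];
        if self.compressed {
            // Placeholder "decompression": just reverse the bytes.
            BufView::Owned(raw.iter().rev().copied().collect())
        } else {
            BufView::Borrowed(raw)
        }
    }
}

fn main() {
    let buf = b"hello world".to_vec();
    let blob = VectoredBlobSketch { start: 0, end: 5, compressed: false };
    assert_eq!(blob.read(&buf).as_slice(), &b"hello"[..]);
}
```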

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-09-24 16:41:38 +00:00
Heikki Linnakangas
2f7cecaf6a test: Poll pageserver availability more aggressively at test startup
Even with the 100 ms interval, on my laptop the pageserver always
becomes available on the second attempt, so this saves about 900 ms at
every test startup.
2024-09-24 17:16:43 +03:00
Heikki Linnakangas
589594c2e1 test: Skip fsync when initdb'ing the storage controller db
After initdb, we configure it with "fsync=off" anyway.
2024-09-24 17:16:43 +03:00
Heikki Linnakangas
70fe007519 test: Make test_hot_standby_feedback more forgiving of slow initialization (#9113)
Don't start waiting for the index to appear in the secondary until it
has been created in the primary. Before, if the "pgbench -i" step took
more than 60 s, we would give up.

There was a flaky test failure along those lines at:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9105/10997477941/index.html#suites/950eff205b552e248417890b8b8f189e/73cf4b5648fa6f74/
Hopefully, this avoids such failures in the future.
2024-09-24 16:41:59 +03:00
a-masterov
b224a5a377 Move the patch to compute (#9120)
## Problem
All the other patches were moved to the compute directory, and only one
was left in the patches subdirectory in the root directory.

## Summary of changes
The patch was moved to the compute directory, like the others.
2024-09-24 15:13:18 +02:00
Christian Schwarz
a65d437930 chore(#9077): cleanups & code dedup (#9082)
Punted from https://github.com/neondatabase/neon/pull/9077
2024-09-24 13:05:07 +00:00
Matthias van de Meent
fc67f8dc60 Update PostgreSQL 17 from 17rc1 to 17.0 (#9119)
The PostgreSQL 17 vendor module is now based on postgres/postgres @
d7ec59a63d745ba74fba0e280bbf85dc6d1caa3e, presumably the final code
change before the V17 tag.
2024-09-24 14:15:52 +02:00
Folke Behrens
2b65a2b53e proxy: check if IP is allowed during webauth flow (#9101)
neondatabase/cloud#12018
2024-09-24 11:52:25 +02:00
Vlad Lazar
9490360df4 storcon: improve initial shard scheduling (#9081)
## Problem

Scheduling on tenant creation uses different heuristics compared to the
scheduling done during
background optimizations. This results in scenarios where shards are
created and then immediately
migrated by the optimizer. 

## Summary of changes

1. Make the scheduler aware of the type of the shard it is scheduling
(attached vs secondary), since we want different heuristics for each.
2. For attached shards, include the attached shard count from the
context in the node score calculation (see the sketch below). This
brings initial shard scheduling in line with what the optimization
passes do.
3. Add a test for (2).

This looks like a bigger change than required, but the refactoring
serves as the basis for az-aware
shard scheduling where we also need to make the distinction between
attached and secondary shards.

Closes https://github.com/neondatabase/neon/issues/8969
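The scoring code itself isn't part of this view; the following sketch, with made-up field names, only illustrates the idea in point (2): attached shards already planned by the scheduling context are counted together with the node's own attached shards, so initial placement ranks nodes the same way the optimizer would.

```rust
// Made-up structs; the real scheduler types differ.
struct NodeState {
    attached_shards: usize,
    is_offline: bool,
}

struct ScheduleContext {
    // Shards that this scheduling pass has already decided to attach to the node.
    planned_attached: usize,
}

// Lower score = more attractive target for an *attached* location.
fn attached_node_score(node: &NodeState, ctx: &ScheduleContext) -> Option<usize> {
    if node.is_offline {
        return None; // never schedule onto an offline node
    }
    // Counting planned attachments keeps initial scheduling consistent with
    // what the background optimization passes would compute afterwards.
    Some(node.attached_shards + ctx.planned_attached)
}

fn main() {
    let node = NodeState { attached_shards: 3, is_offline: false };
    let ctx = ScheduleContext { planned_attached: 2 };
    assert_eq!(attached_node_score(&node, &ctx), Some(5));
}
```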
2024-09-24 09:03:41 +00:00
a-masterov
91d947654e Add regression tests for a cloud-based Neon instance (#8681)
## Problem
We need to be able to run the regression tests against a cloud-based
Neon staging instance to prepare the migration to the arm architecture.

## Summary of changes
Some tests were modified to work on the cloud instance (e.g. added
passwords, server-side copy changed to client-side, etc)

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-09-24 09:44:45 +02:00
Yuchen Liang
37aa6fd953 scrubber: retry when missing index key in the listing (#8873)
Part of #8128, fixes #8872.

## Problem

See #8872.

## Summary of changes

- Retry `list_timeline_blobs` one more time (see the sketch below) if
  - there are layer file keys listed but no index, or
  - the index failed to download.
- Instrument code with `analyze-tenant` and `analyze-timeline` span.
- Remove `initdb_archive` check, it could have been deleted.
- Return with exit code 1 on fatal error if `--exit-code` parameter is set.
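As a shape-only sketch of the retry described above, with hypothetical names (the real code also distinguishes a missing index from a failed index download):

```rust
// Hypothetical result shape for a single listing pass.
struct Listing {
    layer_keys: Vec<String>,
    index_found: bool,
}

fn list_timeline_blobs_once(attempt: u32) -> Listing {
    // Stand-in: pretend the index only shows up on the second attempt,
    // e.g. because the first listing raced with an index upload.
    Listing {
        layer_keys: vec!["layer-000".into()],
        index_found: attempt > 0,
    }
}

// Retry the listing once when it looks inconsistent: layers but no index.
fn list_timeline_blobs_with_retry() -> Listing {
    let first = list_timeline_blobs_once(0);
    if !first.layer_keys.is_empty() && !first.index_found {
        eprintln!("layer keys listed but no index; retrying listing once");
        return list_timeline_blobs_once(1);
    }
    first
}

fn main() {
    let listing = list_timeline_blobs_with_retry();
    assert!(listing.index_found);
}
```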

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-09-23 21:58:12 +00:00
Heikki Linnakangas
3ad567290c Move metric exporter and pgbouncer config files
Instead of adding them to the VM image late in the build process, when
putting together the final VM image, include them in the earlier
compute image already. That makes it more convenient to edit the
files, and to test them.
2024-09-24 00:35:52 +03:00
Heikki Linnakangas
3a110e45ed Move files related to building compute image into compute/ dir
Seems nice to keep all these together. This also provides a nice place
for a README file to describe the compute image build process. For
now, it briefly describes the contents of the directory, but can be
expanded.
2024-09-24 00:35:52 +03:00
Heikki Linnakangas
e7e6319e20 Fix compiler warnings with nightly rustc about elided lifetimes having names (#9105)
The warnings:

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1386:29
         |
    1382 |     pub(crate) fn start_timer<'c: 'a, 'a>(
         |                                       -- lifetime `'a` declared here
    ...
    1386 |     ) -> Option<impl Drop + '_> {
         |                             ^^ this elided lifetime gets resolved as `'a`
         |
         = note: `#[warn(elided_named_lifetimes)]` on by default

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1537:46
         |
    1534 |     pub(crate) fn start_recording<'c: 'a, 'a>(
         |                                           -- lifetime `'a` declared here
    ...
    1537 |     ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
         |                                              ^^ this elided lifetime gets resolved as `'a`

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1537:50
         |
    1534 |     pub(crate) fn start_recording<'c: 'a, 'a>(
         |                                           -- lifetime `'a` declared here
    ...
    1537 |     ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
         |                                                  ^^ this elided lifetime gets resolved as `'a`

    warning: elided lifetime has a name
        --> pageserver/src/tenant.rs:3630:25
         |
    3622 |     async fn prepare_new_timeline<'a>(
         |                                   -- lifetime `'a` declared here
    ...
    3630 |     ) -> anyhow::Result<UninitializedTimeline> {
         |                         ^^^^^^^^^^^^^^^^^^^^^ this elided lifetime gets resolved as `'a`
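For reference, a minimal, self-contained reproduction of the pattern and the obvious way to silence the lint (spelling out the lifetime in the return type instead of eliding it); the actual functions live in `metrics.rs` and `tenant.rs` and are not reproduced here:

```rust
struct Counter {
    value: u64,
}

struct Guard<'a> {
    counter: &'a Counter,
}

impl Counter {
    // Returning `Guard<'_>` here would make rustc resolve the elided lifetime
    // to a named one, which is exactly what `elided_named_lifetimes` warns
    // about. Naming the lifetime in the return type keeps the signature
    // explicit and silences the warning.
    fn start_timer<'c: 'a, 'a>(&'c self) -> Guard<'a> {
        Guard { counter: self }
    }
}

fn main() {
    let counter = Counter { value: 1 };
    let guard = counter.start_timer();
    assert_eq!(guard.counter.value, 1);
}
```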
2024-09-23 23:31:32 +02:00
Matthias van de Meent
d865881d59 NOAI (#9084)
We can't FlushOneBuffer when we're in redo-only mode on PageServer, so
make execution of that function conditional on us not running in
pageserver walredo mode.
2024-09-23 21:16:42 +00:00
Konstantin Knizhnik
1c5d6e59a0 Maintain number of used pages for LFC (#9088)
## Problem

An LFC cache entry is a chunk (currently 1 MB). LFC statistics show the
number of chunks, but not the number of used pages, and the autoscaling
team wants to know how sparse the LFC is:
https://neondb.slack.com/archives/C04DGM6SMTM/p1726782793595969
It is possible to obtain this with `select count(*) from local_cache`,
but that is an expensive operation, enumerating all entries in the LFC
under a lock.

## Summary of changes

This PR adds "file_cache_used_pages" to the `neon_lfc_stats` view:
```
 select * from neon_lfc_stats;
        lfc_key        | lfc_value 
-----------------------+-----------
 file_cache_misses     |   3139029
 file_cache_hits       |   4098394
 file_cache_used       |      1024
 file_cache_writes     |   3173728
 file_cache_size       |      1024
 file_cache_used_pages |     25689
(6 rows)
```

Please note that this PR doesn't change the neon extension API, so there
is no need to create a new version of the Neon extension.
 

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-09-23 22:05:32 +03:00
Heikki Linnakangas
263dfba6ee Add views for metrics about pageserver requests (#9008)
The metrics include a histogram of how long we need to wait for a
GetPage request, number of reconnects, and number of requests among
other things.

The metrics are not yet exported anywhere, but you can query them
manually.

Note: This does *not* bump the default version of the 'neon' extension. We
will do that later, as a separate PR. The reason is that this allows us to roll back
the compute image smoothly, if necessary. Once the image that includes the
new extension .so file with the new functions has been rolled out, and we're
confident that we don't need to roll back the image anymore, we can change
default extension version and actually start using the new functions and views.

This is what the view looks like:

```
postgres=# select * from neon_perf_counters ;
                metric                 | bucket_le |  value   
---------------------------------------+-----------+----------
 getpage_wait_seconds_count            |           |      300
 getpage_wait_seconds_sum              |           | 0.048506
 getpage_wait_seconds_bucket           |     2e-05 |        0
 getpage_wait_seconds_bucket           |     3e-05 |        0
 getpage_wait_seconds_bucket           |     6e-05 |       71
 getpage_wait_seconds_bucket           |    0.0001 |      124
 getpage_wait_seconds_bucket           |    0.0002 |      248
 getpage_wait_seconds_bucket           |    0.0003 |      279
 getpage_wait_seconds_bucket           |    0.0006 |      297
 getpage_wait_seconds_bucket           |     0.001 |      298
 getpage_wait_seconds_bucket           |     0.002 |      298
 getpage_wait_seconds_bucket           |     0.003 |      298
 getpage_wait_seconds_bucket           |     0.006 |      300
 getpage_wait_seconds_bucket           |      0.01 |      300
 getpage_wait_seconds_bucket           |      0.02 |      300
 getpage_wait_seconds_bucket           |      0.03 |      300
 getpage_wait_seconds_bucket           |      0.06 |      300
 getpage_wait_seconds_bucket           |       0.1 |      300
 getpage_wait_seconds_bucket           |       0.2 |      300
 getpage_wait_seconds_bucket           |       0.3 |      300
 getpage_wait_seconds_bucket           |       0.6 |      300
 getpage_wait_seconds_bucket           |         1 |      300
 getpage_wait_seconds_bucket           |         2 |      300
 getpage_wait_seconds_bucket           |         3 |      300
 getpage_wait_seconds_bucket           |         6 |      300
 getpage_wait_seconds_bucket           |        10 |      300
 getpage_wait_seconds_bucket           |        20 |      300
 getpage_wait_seconds_bucket           |        30 |      300
 getpage_wait_seconds_bucket           |        60 |      300
 getpage_wait_seconds_bucket           |       100 |      300
 getpage_wait_seconds_bucket           |  Infinity |      300
 getpage_prefetch_requests_total       |           |       69
 getpage_sync_requests_total           |           |      231
 getpage_prefetch_misses_total         |           |        0
 getpage_prefetch_discards_total       |           |        0
 pageserver_requests_sent_total        |           |      323
 pageserver_requests_disconnects_total |           |        0
 pageserver_send_flushes_total         |           |      323
 file_cache_hits_total                 |           |        0
(39 rows)
```
2024-09-23 21:28:50 +03:00
Heikki Linnakangas
df3996265f test: Downgrade info message on removing empty directories (#9093)
It was pretty noisy. It changed from debug to info level in commit
78938d1b59, but I believe that was not on purpose.
2024-09-23 20:10:22 +02:00
Alex Chi Z.
29699529df feat(pageserver): filter keys with gc-compaction (#9004)
Part of https://github.com/neondatabase/neon/issues/8002

Close https://github.com/neondatabase/neon/issues/8920

Legacy compaction (as well as gc-compaction) relies on the GC process to
remove unused layer files, but this depends on many factors (e.g., key
partitioning) to ensure data in a dropped table is eventually removed.

In gc-compaction, we consider the keyspace information when doing the
compaction process. If a key is not in the keyspace, we will skip that
key and not include it in the final output.

However, this is not easy to implement because gc-compaction considers
branch points (i.e., retain_lsns) and the retained keyspaces could
change across different LSNs. Therefore, for now, we only remove aux v1
keys in the compaction process.

## Summary of changes

* Add `FilterIterator` to filter out keys (see the sketch below).
* Integrate `FilterIterator` with gc-compaction.
* Add `collect_gc_compaction_keyspace` for a spec of keyspaces that can
be retained during the gc-compaction process.
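The real `FilterIterator` works over pageserver key ranges; the toy sketch below only illustrates the same idea of wrapping an ordered key/value stream and dropping keys that fall outside the retained keyspace, so they never reach the compaction output.

```rust
use std::ops::Range;

// Toy key type; the real code filters pageserver Keys against a KeySpace.
type Key = u64;

// Wrap an iterator of (key, value) pairs and only yield keys that fall into
// one of the retained ranges, mirroring what a FilterIterator-style adapter
// does during gc-compaction (e.g. dropping aux v1 keys).
struct FilterIter<I> {
    inner: I,
    retained: Vec<Range<Key>>,
}

impl<I: Iterator<Item = (Key, Vec<u8>)>> Iterator for FilterIter<I> {
    type Item = (Key, Vec<u8>);

    fn next(&mut self) -> Option<Self::Item> {
        let retained = &self.retained;
        // Skip over keys outside every retained range.
        self.inner
            .by_ref()
            .find(|(k, _)| retained.iter().any(|r| r.contains(k)))
    }
}

fn main() {
    let input = vec![(1, vec![]), (5, vec![]), (9, vec![])];
    let filtered = FilterIter {
        inner: input.into_iter(),
        retained: vec![0..4, 8..10],
    };
    let keys: Vec<Key> = filtered.map(|(k, _)| k).collect();
    assert_eq!(keys, vec![1, 9]);
}
```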

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-23 16:30:44 +00:00
Nikita Kalyanov
f446e08fb8 change HTTP method to comply with spec (#9100)
There is a discrepancy with the spec, which has PUT.
2024-09-23 15:53:06 +02:00
Christian Schwarz
4d5add9ca0 compact_level0_phase1: remove final traces of value access mode config (#8935)
refs https://github.com/neondatabase/neon/issues/8184
stacked atop https://github.com/neondatabase/neon/pull/8934

This PR changes from ignoring the config field to rejecting configs that
contain it.

PR https://github.com/neondatabase/infra/pull/1903 removes the field
usage from `pageserver.toml`.

It rolls into prod sooner than, or in the same release as, this PR.
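The actual config-parsing change isn't shown here; purely as a generic illustration of ignore-vs-reject (assuming `serde` with the derive feature and the `toml` crate, and a made-up field name), `deny_unknown_fields` is one way to make a config that still contains a removed field fail loudly instead of being silently accepted:

```rust
use serde::Deserialize;

// Minimal, made-up config struct; the real pageserver config is much larger.
// `deny_unknown_fields` is one generic way to turn "silently ignore a stale
// field" into "reject the config that still contains it".
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct ConfSketch {
    compaction_threshold: Option<u64>,
}

fn main() {
    // A config that still sets a removed knob (made-up name).
    let stale = r#"
        compaction_threshold = 10
        some_removed_value_access_knob = "streaming"
    "#;

    let parsed: Result<ConfSketch, _> = toml::from_str(stale);
    // With deny_unknown_fields the stale field is a hard parse error, not a no-op.
    assert!(parsed.is_err());
    println!("rejected stale config: {:?}", parsed.err());
}
```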
2024-09-23 15:05:22 +02:00
Christian Schwarz
59b4c2eaf9 walredo: add a ping method (#8952)
Not used in production, but in benchmarks, to demonstrate minimal RTT.
(It would be nice not to have to copy the 8KiB of zeroes, but that
would require larger protocol changes.)

Found this useful in investigation
https://github.com/neondatabase/neon/pull/8952.
2024-09-23 10:19:37 +00:00
Vlad Lazar
5432155b0d storcon: update compute hook state on detach (#9045)
## Problem

Previously, the storage controller might send compute notifications
containing stale pageservers (i.e. pageserver serving the shard was
detached). This happened because detaches did not update the compute
hook state.

## Summary of Changes

Update compute hook state on shard detach.

Fixes #8928
2024-09-23 10:05:02 +01:00
Heikki Linnakangas
e16e82749f Remove unused crates from workspace Cargo.toml
These were not referenced in any of the other Cargo.toml files in the
workspace. They were not being built because of that, so there was
little harm in having them listed, but let's be tidy.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
9f653893b9 Update a few dependencies, removing some indirect dependencies
cargo update ciborium iana-time-zone lazy_static schannel uuid
    cargo update hyper@0.14
    cargo update  --precise 2.9.7 ureq

It might be worthwhile to just update all our dependencies at some point,
but this is aimed at pruning the dependency tree, to make the build a
little faster. That's also why I didn't update ureq to the latest
version: that would've added a dependency on yet another version of
rustls.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
913af44219 Update "memoffset" crate
To eliminate one version of it from our dependency tree.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
ecd615ab6d Update "hostname" crate
We were already building v0.4.0 as an indirect dependency, so this
avoids having to build two different versions of it.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
c9b2ec9ff1 Check submodule forward progress (#8949)
We frequently mess up our submodule references. This adds one safeguard:
it checks that the submodule references are only updated "forwards", not
to some older commit, or a commit that's not a descendant of the previous
one.
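The check itself is done by an off-the-shelf GitHub Action (see the workflow diff further below); as a standalone illustration of what "only updated forwards" means, one can ask git whether the old submodule commit is an ancestor of the new one:

```rust
use std::process::Command;

/// Returns true if `old` is an ancestor of (or equal to) `new` in the given
/// repository, i.e. the submodule reference only moved "forwards".
fn is_fast_forward(repo_path: &str, old: &str, new: &str) -> std::io::Result<bool> {
    // `git merge-base --is-ancestor A B` exits with 0 iff A is an ancestor of B.
    let status = Command::new("git")
        .args(["-C", repo_path, "merge-base", "--is-ancestor", old, new])
        .status()?;
    Ok(status.success())
}

fn main() -> std::io::Result<()> {
    // Illustrative values; in CI these would be the submodule SHAs before and
    // after the change, for e.g. vendor/postgres-v16.
    let ok = is_fast_forward("vendor/postgres-v16", "OLD_SHA", "NEW_SHA")?;
    println!("submodule update is forward-only: {ok}");
    Ok(())
}
```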

As next step, I'm thinking that we should automate things so that when
you merge a PR to the 'neon' repository that updates the submodule
references, the REL_*_STABLE_neon branches are automatically updated to
match the submodule references. That way, you never need to manually
merge PRs in the postgres repository, it's all triggered from commits in
the 'neon' repository. But that's not included here.
2024-09-22 21:46:53 +03:00
85 changed files with 6668 additions and 1184 deletions

View File

@@ -13,6 +13,7 @@
# Directories
!.cargo/
!.config/
!compute/
!compute_tools/
!control_plane/
!libs/

View File

@@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
arch: [ x64, arm64 ]
arch: [ x64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -79,7 +79,9 @@ jobs:
push: true
pull: true
file: Dockerfile.build-tools
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
cache-from: |
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}-does-not-exist-2
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

View File

@@ -6,6 +6,7 @@ on:
- main
- release
- release-proxy
- bayandin/test
pull_request:
defaults:
@@ -27,7 +28,7 @@ env:
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
if: false
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
@@ -78,7 +79,6 @@ jobs:
id: build-tag
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
@@ -120,6 +120,59 @@ jobs:
- name: Run mypy to check types
run: poetry run mypy .
# Check that the vendor/postgres-* submodules point to the
# corresponding REL_*_STABLE_neon branches.
check-submodules:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- uses: dorny/paths-filter@v3
id: check-if-submodules-changed
with:
filters: |
vendor:
- 'vendor/**'
- name: Check vendor/postgres-v14 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v14"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v15 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v15"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v16 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v16"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v17 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v17"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
strategy:
@@ -598,7 +651,7 @@ jobs:
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
file: compute/Dockerfile.compute-node
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
tags: |
@@ -617,7 +670,7 @@ jobs:
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
file: compute/Dockerfile.compute-node
target: neon-pg-ext-test
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
@@ -638,7 +691,7 @@ jobs:
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
file: compute/Dockerfile.compute-node
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
@@ -726,7 +779,7 @@ jobs:
- name: Build vm image
run: |
./vm-builder \
-spec=vm-image-spec.yaml \
-spec=compute/vm-image-spec.yaml \
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}

View File

@@ -33,7 +33,7 @@ jobs:
IMAGE_TAG: |
${{ hashFiles('Dockerfile.build-tools',
'.github/workflows/check-build-tools-image.yml',
'.github/workflows/build-build-tools-image.yml') }}
'.github/workflows/build-build-tools-image.yml') }}-test
run: |
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT

102
.github/workflows/cloud-regress.yml vendored Normal file
View File

@@ -0,0 +1,102 @@
name: Cloud Regression Test
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '45 1 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow
group: ${{ github.workflow }}
cancel-in-progress: true
jobs:
regress:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
runs-on: us-east-2
container:
image: neondatabase/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Patch the test
run: |
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
- name: Generate a random password
id: pwgen
run: |
set +x
DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
echo "::add-mask::${DBPASS//\//}"
echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"
- name: Change tests according to the generated password
env:
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
run: |
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
for fname in sql/*.sql expected/*.out; do
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
done
for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
USER=$(echo "${ph}" | cut -c 22-)
MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
sed -i.bak "s/${ph}/${MD5}/" expected/password.out
done
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Run the regression tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: cloud_regress
pg_version: ${{ env.DEFAULT_PG_VERSION }}
extra_params: -m remote_cluster
env:
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # on-call-staging-stream
slack-message: |
Periodic pg_regress on staging: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -107,7 +107,7 @@ jobs:
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)

307
Cargo.lock generated
View File

@@ -255,12 +255,6 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "atomic"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic-take"
version = "1.1.0"
@@ -295,8 +289,8 @@ dependencies = [
"fastrand 2.0.0",
"hex",
"http 0.2.9",
"hyper 0.14.26",
"ring 0.17.6",
"hyper 0.14.30",
"ring",
"time",
"tokio",
"tracing",
@@ -486,7 +480,7 @@ dependencies = [
"once_cell",
"p256 0.11.1",
"percent-encoding",
"ring 0.17.6",
"ring",
"sha2",
"subtle",
"time",
@@ -593,7 +587,7 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"http-body 1.0.0",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-rustls 0.24.0",
"once_cell",
"pin-project-lite",
@@ -684,7 +678,7 @@ dependencies = [
"futures-util",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"itoa",
"matchit 0.7.0",
"memchr",
@@ -1089,9 +1083,9 @@ dependencies = [
[[package]]
name = "ciborium"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
@@ -1100,18 +1094,18 @@ dependencies = [
[[package]]
name = "ciborium-io"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half 1.8.2",
"half",
]
[[package]]
@@ -1224,7 +1218,7 @@ dependencies = [
"compute_api",
"flate2",
"futures",
"hyper 0.14.26",
"hyper 0.14.30",
"nix 0.27.1",
"notify",
"num_cpus",
@@ -1330,7 +1324,7 @@ dependencies = [
"git-version",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"nix 0.27.1",
"once_cell",
"pageserver_api",
@@ -2304,12 +2298,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.4.1"
@@ -2411,17 +2399,6 @@ dependencies = [
"digest",
]
[[package]]
name = "hostname"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
dependencies = [
"libc",
"match_cfg",
"winapi",
]
[[package]]
name = "hostname"
version = "0.4.0"
@@ -2430,7 +2407,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
dependencies = [
"cfg-if",
"libc",
"windows 0.52.0",
"windows",
]
[[package]]
@@ -2539,9 +2516,9 @@ dependencies = [
[[package]]
name = "hyper"
version = "0.14.26"
version = "0.14.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
dependencies = [
"bytes",
"futures-channel",
@@ -2554,7 +2531,7 @@ dependencies = [
"httpdate",
"itoa",
"pin-project-lite",
"socket2 0.4.9",
"socket2",
"tokio",
"tower-service",
"tracing",
@@ -2589,7 +2566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper 0.14.30",
"log",
"rustls 0.21.11",
"rustls-native-certs 0.6.2",
@@ -2620,7 +2597,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper 0.14.26",
"hyper 0.14.30",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
@@ -2639,7 +2616,7 @@ dependencies = [
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"socket2",
"tokio",
"tower",
"tower-service",
@@ -2648,16 +2625,16 @@ dependencies = [
[[package]]
name = "iana-time-zone"
version = "0.1.56"
version = "0.1.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c"
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows 0.48.0",
"windows-core",
]
[[package]]
@@ -2870,7 +2847,7 @@ dependencies = [
"base64 0.21.1",
"js-sys",
"pem",
"ring 0.17.6",
"ring",
"serde",
"serde_json",
"simple_asn1",
@@ -2908,11 +2885,11 @@ dependencies = [
[[package]]
name = "lazy_static"
version = "1.4.0"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
dependencies = [
"spin 0.5.2",
"spin",
]
[[package]]
@@ -2974,12 +2951,6 @@ dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
[[package]]
name = "matchers"
version = "0.1.0"
@@ -3072,15 +3043,6 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.9.0"
@@ -3660,7 +3622,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"indoc",
"itertools 0.10.5",
"md5",
@@ -3853,7 +3815,7 @@ dependencies = [
"ahash",
"bytes",
"chrono",
"half 2.4.1",
"half",
"hashbrown 0.14.5",
"num",
"num-bigint",
@@ -4140,7 +4102,7 @@ dependencies = [
"crc32c",
"env_logger",
"log",
"memoffset 0.8.0",
"memoffset 0.9.0",
"once_cell",
"postgres",
"regex",
@@ -4350,12 +4312,12 @@ dependencies = [
"hashlink",
"hex",
"hmac",
"hostname 0.3.1",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper 1.2.0",
"hyper-util",
"indexmap 2.0.1",
@@ -4400,7 +4362,7 @@ dependencies = [
"signature 2.2.0",
"smallvec",
"smol_str",
"socket2 0.5.5",
"socket2",
"subtle",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4578,7 +4540,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
dependencies = [
"pem",
"ring 0.17.6",
"ring",
"time",
"yasna",
]
@@ -4602,7 +4564,7 @@ dependencies = [
"rustls-pki-types",
"ryu",
"sha1_smol",
"socket2 0.5.5",
"socket2",
"tokio",
"tokio-rustls 0.25.0",
"tokio-util",
@@ -4714,7 +4676,7 @@ dependencies = [
"futures-util",
"http-types",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"itertools 0.10.5",
"metrics",
"once_cell",
@@ -4747,7 +4709,7 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-rustls 0.24.0",
"ipnet",
"js-sys",
@@ -4905,21 +4867,6 @@ dependencies = [
"subtle",
]
[[package]]
name = "ring"
version = "0.16.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
dependencies = [
"cc",
"libc",
"once_cell",
"spin 0.5.2",
"untrusted 0.7.1",
"web-sys",
"winapi",
]
[[package]]
name = "ring"
version = "0.17.6"
@@ -4929,8 +4876,8 @@ dependencies = [
"cc",
"getrandom 0.2.11",
"libc",
"spin 0.9.8",
"untrusted 0.9.0",
"spin",
"untrusted",
"windows-sys 0.48.0",
]
@@ -4950,7 +4897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper 0.14.30",
"lazy_static",
"percent-encoding",
"regex",
@@ -5074,7 +5021,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
dependencies = [
"log",
"ring 0.17.6",
"ring",
"rustls-webpki 0.101.7",
"sct",
]
@@ -5086,7 +5033,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
dependencies = [
"log",
"ring 0.17.6",
"ring",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"subtle",
@@ -5143,24 +5090,14 @@ version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
[[package]]
name = "rustls-webpki"
version = "0.100.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
dependencies = [
"ring 0.16.20",
"untrusted 0.7.1",
]
[[package]]
name = "rustls-webpki"
version = "0.101.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765"
dependencies = [
"ring 0.17.6",
"untrusted 0.9.0",
"ring",
"untrusted",
]
[[package]]
@@ -5169,9 +5106,9 @@ version = "0.102.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
dependencies = [
"ring 0.17.6",
"ring",
"rustls-pki-types",
"untrusted 0.9.0",
"untrusted",
]
[[package]]
@@ -5205,7 +5142,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5262,11 +5199,11 @@ dependencies = [
[[package]]
name = "schannel"
version = "0.1.21"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3"
checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
dependencies = [
"windows-sys 0.42.0",
"windows-sys 0.52.0",
]
[[package]]
@@ -5290,8 +5227,8 @@ version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
dependencies = [
"ring 0.17.6",
"untrusted 0.9.0",
"ring",
"untrusted",
]
[[package]]
@@ -5400,7 +5337,7 @@ version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
dependencies = [
"hostname 0.4.0",
"hostname",
"libc",
"os_info",
"rustc_version",
@@ -5712,16 +5649,6 @@ dependencies = [
"serde",
]
[[package]]
name = "socket2"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "socket2"
version = "0.5.5"
@@ -5732,12 +5659,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "spin"
version = "0.9.8"
@@ -5783,7 +5704,7 @@ dependencies = [
"futures-util",
"git-version",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5812,7 +5733,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"itertools 0.10.5",
"lasso",
"measured",
@@ -6228,7 +6149,7 @@ dependencies = [
"num_cpus",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.5.5",
"socket2",
"tokio-macros",
"windows-sys 0.48.0",
]
@@ -6288,7 +6209,7 @@ dependencies = [
"pin-project-lite",
"postgres-protocol",
"postgres-types",
"socket2 0.5.5",
"socket2",
"tokio",
"tokio-util",
]
@@ -6300,7 +6221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
dependencies = [
"futures",
"ring 0.17.6",
"ring",
"rustls 0.22.4",
"tokio",
"tokio-postgres",
@@ -6434,7 +6355,7 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-timeout",
"percent-encoding",
"pin-project",
@@ -6611,7 +6532,7 @@ dependencies = [
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"hyper 0.14.26",
"hyper 0.14.30",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
@@ -6714,12 +6635,6 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]]
name = "untrusted"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "untrusted"
version = "0.9.0"
@@ -6728,17 +6643,18 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "2.7.1"
version = "2.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
dependencies = [
"base64 0.21.1",
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.21.11",
"rustls-webpki 0.100.2",
"rustls 0.22.4",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"url",
"webpki-roots 0.23.1",
"webpki-roots 0.26.1",
]
[[package]]
@@ -6802,7 +6718,7 @@ dependencies = [
"hex",
"hex-literal",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
@@ -6837,11 +6753,10 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.6.1"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
dependencies = [
"atomic",
"getrandom 0.2.11",
"serde",
]
@@ -7075,15 +6990,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
dependencies = [
"rustls-webpki 0.100.2",
]
[[package]]
name = "webpki-roots"
version = "0.25.2"
@@ -7152,15 +7058,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows"
version = "0.52.0"
@@ -7180,21 +7077,6 @@ dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-sys"
version = "0.42.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
dependencies = [
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
@@ -7243,12 +7125,6 @@ dependencies = [
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
@@ -7261,12 +7137,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
@@ -7279,12 +7149,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
@@ -7297,12 +7161,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
@@ -7315,12 +7173,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
@@ -7333,12 +7185,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
@@ -7351,12 +7197,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
@@ -7433,10 +7273,11 @@ dependencies = [
"futures-util",
"generic-array",
"getrandom 0.2.11",
"half",
"hashbrown 0.14.5",
"hex",
"hmac",
"hyper 0.14.26",
"hyper 0.14.30",
"indexmap 1.9.3",
"itertools 0.10.5",
"itertools 0.12.1",
@@ -7504,7 +7345,7 @@ dependencies = [
"der 0.7.8",
"hex",
"pem",
"ring 0.17.6",
"ring",
"signature 2.2.0",
"spki 0.7.3",
"thiserror",

View File

@@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "7.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"
@@ -95,7 +93,7 @@ hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
@@ -104,7 +102,6 @@ hyper = "0.14"
tokio-tungstenite = "0.20.0"
indexmap = "2"
indoc = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
@@ -113,7 +110,7 @@ libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -142,7 +139,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.22"
rustls-pemfile = "2"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -164,7 +160,6 @@ strum_macros = "0.26"
svg_fmt = "0.4.3"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = "0.5"

View File

@@ -280,7 +280,7 @@ FROM build-deps AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
COPY compute/patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
@@ -366,7 +366,7 @@ FROM build-deps AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
COPY compute/patches/rum.patch /rum.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -1031,6 +1031,41 @@ FROM debian:bullseye-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
#########################################################################################
#
# Layer "pgbouncer"
#
#########################################################################################
FROM debian:bullseye-slim AS pgbouncer
RUN set -e \
&& apt-get update \
&& apt-get install -y \
build-essential \
git \
libevent-dev \
libtool \
pkg-config
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
#########################################################################################
#
# Layers "postgres-exporter" and "sql-exporter"
#
#########################################################################################
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
#########################################################################################
#
# Clean up postgres folder before inclusion
@@ -1078,7 +1113,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
COPY compute/patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -1086,9 +1121,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hint_plan.patch /ext-src
COPY compute/patches/pg_hint_plan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
COPY compute/patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
@@ -1097,7 +1132,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
COPY patches/pg_anon.patch /ext-src
COPY compute/patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN case "${PG_VERSION}" in "v17") \
@@ -1160,9 +1195,23 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
# Metrics exporter binaries and configuration files
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)

21
compute/README.md Normal file
View File

@@ -0,0 +1,21 @@
This directory contains files that are needed to build the compute
images, or included in the compute images.

Dockerfile.compute-node
    To build the compute image

vm-image-spec.yaml
    Instructions for vm-builder, to turn the compute-node image into
    corresponding vm-compute-node image.

etc/
    Configuration files included in /etc in the compute image

patches/
    Some extensions need to be patched to work with Neon. This
    directory contains such patches. They are applied to the extension
    sources in Dockerfile.compute-node

In addition to these, postgres itself, the neon postgres extension,
and compute_ctl are built and copied into the compute image by
Dockerfile.compute-node.

View File

@@ -0,0 +1,247 @@
collector_name: neon_collector
metrics:
- metric_name: lfc_misses
type: gauge
help: 'lfc_misses'
key_labels:
values: [lfc_misses]
query: |
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
- metric_name: lfc_used
type: gauge
help: 'LFC chunks used (chunk = 1MB)'
key_labels:
values: [lfc_used]
query: |
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
- metric_name: lfc_hits
type: gauge
help: 'lfc_hits'
key_labels:
values: [lfc_hits]
query: |
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
- metric_name: lfc_writes
type: gauge
help: 'lfc_writes'
key_labels:
values: [lfc_writes]
query: |
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
- metric_name: lfc_cache_size_limit
type: gauge
help: 'LFC cache size limit in bytes'
key_labels:
values: [lfc_cache_size_limit]
query: |
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
- metric_name: connection_counts
type: gauge
help: 'Connection counts'
key_labels:
- datname
- state
values: [count]
query: |
select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
- metric_name: pg_stats_userdb
type: gauge
help: 'Stats for several oldest non-system dbs'
key_labels:
- datname
value_label: kind
values:
- db_size
- deadlocks
# Rows
- inserted
- updated
- deleted
# We export stats for 10 non-system database. Without this limit
# it is too easy to abuse the system by creating lots of databases.
query: |
select pg_database_size(datname) as db_size, deadlocks,
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
datname
from pg_stat_database
where datname IN (
select datname
from pg_database
where datname <> 'postgres' and not datistemplate
order by oid
limit 10
);
- metric_name: max_cluster_size
type: gauge
help: 'neon.max_cluster_size setting'
key_labels:
values: [max_cluster_size]
query: |
select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
- metric_name: db_total_size
type: gauge
help: 'Size of all databases'
key_labels:
values: [total]
query: |
select sum(pg_database_size(datname)) as total from pg_database;
# DEPRECATED
- metric_name: lfc_approximate_working_set_size
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels:
values: [approximate_working_set_size]
query: |
select neon.approximate_working_set_size(false) as approximate_working_set_size;
- metric_name: lfc_approximate_working_set_size_windows
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels: [duration]
values: [size]
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
# of durations in a pretty-printed form.
query: |
select
x as duration,
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
from
(values ('5m'),('15m'),('1h')) as t (x);
- metric_name: compute_current_lsn
type: gauge
help: 'Current LSN of the database'
key_labels:
values: [lsn]
query: |
select
case
when pg_catalog.pg_is_in_recovery()
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
else (pg_current_wal_lsn() - '0/0')::FLOAT8
end as lsn;
- metric_name: compute_receive_lsn
type: gauge
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
key_labels:
values: [lsn]
query: |
SELECT
CASE
WHEN pg_catalog.pg_is_in_recovery()
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
ELSE 0
END AS lsn;
- metric_name: replication_delay_bytes
type: gauge
help: 'Bytes between received and replayed LSN'
key_labels:
values: [replication_delay_bytes]
# We use a GREATEST call here because this calculation can be negative.
# The calculation is not atomic, meaning after we've gotten the receive
# LSN, the replay LSN may have advanced past the receive LSN we
# are using for the calculation.
query: |
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
- metric_name: replication_delay_seconds
type: gauge
help: 'Time since last LSN was replayed'
key_labels:
values: [replication_delay_seconds]
query: |
SELECT
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;
- metric_name: checkpoints_req
type: gauge
help: 'Number of requested checkpoints'
key_labels:
values: [checkpoints_req]
query: |
SELECT checkpoints_req FROM pg_stat_bgwriter;
- metric_name: checkpoints_timed
type: gauge
help: 'Number of scheduled checkpoints'
key_labels:
values: [checkpoints_timed]
query: |
SELECT checkpoints_timed FROM pg_stat_bgwriter;
- metric_name: compute_logical_snapshot_files
type: gauge
help: 'Number of snapshot files in pg_logical/snapshot'
key_labels:
- timeline_id
values: [num_logical_snapshot_files]
query: |
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-- temporary snapshot files are renamed to the actual snapshot files after they are
-- completely built. We only WAL-log the completely built snapshot files.
(SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
- metric_name: logical_slot_restart_lsn
type: gauge
help: 'restart_lsn of logical slots'
key_labels:
- slot_name
values: [restart_lsn]
query: |
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
from pg_replication_slots
where slot_type = 'logical';
- metric_name: compute_subscriptions_count
type: gauge
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
key_labels:
- enabled
values: [subscriptions_count]
query: |
select subenabled::text as enabled, count(*) as subscriptions_count
from pg_subscription
group by subenabled;
- metric_name: retained_wal
type: gauge
help: 'Retained WAL in inactive replication slots'
key_labels:
- slot_name
values: [retained_wal]
query: |
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;
- metric_name: wal_is_lost
type: gauge
help: 'Whether or not the replication slot wal_status is lost'
key_labels:
- slot_name
values: [wal_is_lost]
query: |
SELECT slot_name,
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
FROM pg_replication_slots;
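The lfc_approximate_working_set_size_windows metric above calls neon.approximate_working_set_size_seconds with a small fixed set of durations. The same function can also be queried directly for an arbitrary lookback window when inspecting a compute by hand; a minimal sketch, assuming the connecting role may call functions in the neon schema (the 30-minute window is an illustrative choice, not one of the exported durations):

-- Approximate working set size (in 8192-byte pages) over the last 30 minutes.
select neon.approximate_working_set_size_seconds(
    extract('epoch' from '30m'::interval)::int
) as approximate_working_set_size;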

View File

@@ -0,0 +1,55 @@
collector_name: neon_collector_autoscaling
metrics:
- metric_name: lfc_misses
type: gauge
help: 'lfc_misses'
key_labels:
values: [lfc_misses]
query: |
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
- metric_name: lfc_used
type: gauge
help: 'LFC chunks used (chunk = 1MB)'
key_labels:
values: [lfc_used]
query: |
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
- metric_name: lfc_hits
type: gauge
help: 'lfc_hits'
key_labels:
values: [lfc_hits]
query: |
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
- metric_name: lfc_writes
type: gauge
help: 'lfc_writes'
key_labels:
values: [lfc_writes]
query: |
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
- metric_name: lfc_cache_size_limit
type: gauge
help: 'LFC cache size limit in bytes'
key_labels:
values: [lfc_cache_size_limit]
query: |
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
- metric_name: lfc_approximate_working_set_size_windows
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels: [duration_seconds]
values: [size]
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
# size looking back 1..60 minutes, labeled with the lookback window in seconds.
query: |
select
x::text as duration_seconds,
neon.approximate_working_set_size_seconds(x) as size
from
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);

17
compute/etc/pgbouncer.ini Normal file
View File

@@ -0,0 +1,17 @@
[databases]
*=host=localhost port=5432 auth_user=cloud_admin
[pgbouncer]
listen_port=6432
listen_addr=0.0.0.0
auth_type=scram-sha-256
auth_user=cloud_admin
auth_dbname=postgres
client_tls_sslmode=disable
server_tls_sslmode=disable
pool_mode=transaction
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777

View File

@@ -0,0 +1,33 @@
# Configuration for sql_exporter
# Global defaults.
global:
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
scrape_timeout: 10s
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
scrape_timeout_offset: 500ms
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
min_interval: 0s
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
# as will concurrent scrapes.
max_connections: 1
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
# always be the same as max_connections.
max_idle_connections: 1
# Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
# If 0, connections are not closed due to a connection's age.
max_connection_lifetime: 5m
# The target to monitor and the collectors to execute on it.
target:
# Data source name always has a URI scheme that matches the driver name. In some cases (e.g. MySQL)
# the scheme gets dropped or replaced to match the driver's expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collectors: [neon_collector]
# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
- "neon_collector.yml"

View File

@@ -0,0 +1,33 @@
# Configuration for sql_exporter for autoscaling-agent
# Global defaults.
global:
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
scrape_timeout: 10s
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
scrape_timeout_offset: 500ms
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
min_interval: 0s
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
# as will concurrent scrapes.
max_connections: 1
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
# always be the same as max_connections.
max_idle_connections: 1
# Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
# If 0, connections are not closed due to a connection's age.
max_connection_lifetime: 5m
# The target to monitor and the collectors to execute on it.
target:
# Data source name always has a URI scheme that matches the driver name. In some cases (e.g. MySQL)
# the scheme gets dropped or replaced to match the driver's expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collectors: [neon_collector_autoscaling]
# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
- "neon_collector_autoscaling.yml"

File diff suppressed because it is too large

112
compute/vm-image-spec.yaml Normal file
View File

@@ -0,0 +1,112 @@
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
---
commands:
- name: cgconfigparser
user: root
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
# running it as root.
- name: chmod-resize-swap
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/resize-swap'
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
- name: sql-exporter
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
- name: sql-exporter-autoscaling
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
- filename: compute_ctl-resize-swap
content: |
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# as root without entering a password (NOPASSWD), regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
group neon-postgres {
perm {
admin {
uid = postgres;
}
task {
gid = users;
}
}
memory {}
}
build: |
# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
# requires cgroup v2, so we'll build cgroup-tools ourselves.
FROM debian:bullseye-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install
merge: |
# tweak nofile limits
RUN set -e \
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
&& test ! -e /etc/security || ( \
echo '* - nofile 1048576' >>/etc/security/limits.conf \
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
)
# Allow postgres user (compute_ctl) to run swap resizer.
# Need to install sudo in order to allow this.
#
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
RUN set -e \
&& apt update \
&& apt install --no-install-recommends -y \
sudo \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
COPY cgconfig.conf /etc/cgconfig.conf
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -346,7 +346,14 @@ impl StorageController {
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
let initdb_args = [
"-D",
pg_data_path.as_ref(),
"--username",
&username(),
"--no-sync",
"--no-instructions",
];
tracing::info!(
"Initializing storage controller database with args: {:?}",
initdb_args

View File

@@ -2,8 +2,8 @@
# Example docker compose configuration
The configuration in this directory is used for testing Neon docker images: it is
not intended for deploying a usable system. To run a development environment where
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
not intended for deploying a usable system. To run a development environment where
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
This configuration does not start the storage controller, because the controller
needs a way to reconfigure running computes, and no such thing exists in this setup.

View File

@@ -104,9 +104,6 @@ pub struct ConfigToml {
pub image_compression: ImageCompressionAlgorithm,
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
#[serde(skip_serializing)]
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
pub io_buffer_alignment: usize,
}
@@ -384,7 +381,6 @@ impl Default for ConfigToml {
image_compression: (DEFAULT_IMAGE_COMPRESSION),
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: None,
compact_level0_phase1_value_access: Default::default(),
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,

View File

@@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
/// ```mermaid
/// stateDiagram-v2
///
/// [*] --> Loading: spawn_load()
/// [*] --> Attaching: spawn_attach()
///
/// Loading --> Activating: activate()
/// Attaching --> Activating: activate()
/// Activating --> Active: infallible
///
/// Loading --> Broken: load() failure
/// Attaching --> Broken: attach() failure
///
/// Active --> Stopping: set_stopping(), part of shutdown & detach
@@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
)]
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
/// This tenant is being loaded from local disk.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
Loading,
/// This tenant is being attached to the pageserver.
///
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
@@ -121,8 +114,6 @@ impl TenantState {
// But, our attach task might still be fetching the remote timelines, etc.
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
// We only reach Active after successful load / attach.
// So, call attachment status Attached.
Self::Active => Attached,
@@ -191,10 +182,11 @@ impl LsnLease {
}
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
/// useless. Remove, once we've checked that there's no client code left that looks at this.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
Loading,
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
Attaching,
}
@@ -1562,11 +1554,8 @@ mod tests {
#[test]
fn tenantstatus_activating_serde() {
let states = [
TenantState::Activating(ActivatingFrom::Loading),
TenantState::Activating(ActivatingFrom::Attaching),
];
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
let states = [TenantState::Activating(ActivatingFrom::Attaching)];
let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
let actual = serde_json::to_string(&states).unwrap();
@@ -1581,13 +1570,7 @@ mod tests {
fn tenantstatus_activating_strum() {
// tests added, because we use these for metrics
let examples = [
(line!(), TenantState::Loading, "Loading"),
(line!(), TenantState::Attaching, "Attaching"),
(
line!(),
TenantState::Activating(ActivatingFrom::Loading),
"Activating",
),
(
line!(),
TenantState::Activating(ActivatingFrom::Attaching),

View File

@@ -1,7 +1,7 @@
//! Quantify a single walredo manager's throughput under N concurrent callers.
//!
//! The benchmark implementation ([`bench_impl`]) is parametrized by
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
//! - `nclients` => number of clients (more on this shortly).
//!
@@ -10,7 +10,7 @@
//! Each task executes the `redo_work` `n_redos/nclients` times.
//!
//! We exercise the following combinations:
//! - `redo_work = short / medium``
//! - `redo_work = ping / short / medium``
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
//!
//! We let `criterion` determine the `n_redos` using `iter_custom`.
@@ -27,33 +27,43 @@
//!
//! # Reference Numbers
//!
//! 2024-04-15 on i3en.3xlarge
//! 2024-09-18 on im4gn.2xlarge
//!
//! ```text
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs]
//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs]
//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs]
//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs]
//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs]
//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs]
//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs]
//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs]
//! short/1 time: [31.833 µs 32.126 µs 32.428 µs]
//! short/2 time: [35.558 µs 35.756 µs 35.992 µs]
//! short/4 time: [44.850 µs 45.138 µs 45.484 µs]
//! short/8 time: [65.985 µs 66.379 µs 66.853 µs]
//! short/16 time: [127.06 µs 127.90 µs 128.87 µs]
//! short/32 time: [252.98 µs 254.70 µs 256.73 µs]
//! short/64 time: [497.13 µs 499.86 µs 503.26 µs]
//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms]
//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs]
//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs]
//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs]
//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs]
//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms]
//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms]
//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms]
//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms]
//! ```
use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
future::Future,
sync::Arc,
time::{Duration, Instant},
};
@@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
macro_rules! bench_group {
($name:expr, $redo_work:expr) => {{
let name: &str = $name;
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(name);
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
},
);
}
}};
}
//
// benchmark the protocol implementation
//
let pg_version = 14;
bench_group!(
"ping",
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
let _: () = mgr.ping(pg_version).await.unwrap();
})
);
//
// benchmarks with actual record redo
//
let make_redo_work = |req: &'static Request| {
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
let page = req.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
})
};
bench_group!("short", {
static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
make_redo_work(&REQUEST)
});
bench_group!("medium", {
static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
make_redo_work(&REQUEST)
});
}
criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
@@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
})
}
async fn client(
async fn client<F, Fut>(
mgr: Arc<PostgresRedoManager>,
start: Arc<Barrier>,
redo_work: Arc<Request>,
redo_work: Arc<F>,
n_redos: u64,
) -> Duration {
) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
start.wait().await;
let start = Instant::now();
for _ in 0..n_redos {
let page = redo_work.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
redo_work(Arc::clone(&mgr)).await;
// The real pageserver will rarely if ever do 2 walredos in a row without
// yielding to the executor.
tokio::task::yield_now().await;

View File

@@ -432,7 +432,7 @@ impl Client {
self.mgmt_api_endpoint
);
self.request(Method::POST, &uri, req)
self.request(Method::PUT, &uri, req)
.await?
.json()
.await

View File

@@ -324,7 +324,6 @@ impl PageServerConf {
max_vectored_read_bytes,
image_compression,
ephemeral_bytes_per_memory_kb,
compact_level0_phase1_value_access: _,
l0_flush,
virtual_file_direct_io,
concurrent_tenant_warmup,
@@ -535,16 +534,6 @@ mod tests {
.expect("parse_and_validate");
}
#[test]
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
let input = indoc::indoc! {r#"
[compact_level0_phase1_value_access]
mode = "streaming-kmerge"
validate = "key-lsn-value"
"#};
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
}
/// If there's a typo in the pageserver config, we'd rather catch that typo
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
/// made it in the believe that their config change is effective.

View File

@@ -2955,7 +2955,7 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.post(
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)

View File

@@ -1383,7 +1383,7 @@ impl SmgrQueryTimePerTimeline {
&'a self,
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
) -> Option<impl Drop + 'a> {
let start = Instant::now();
self.global_started[op as usize].inc();
@@ -1534,7 +1534,7 @@ impl BasebackupQueryTime {
pub(crate) fn start_recording<'c: 'a, 'a>(
&'a self,
ctx: &'c RequestContext,
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
let start = Instant::now();
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
@@ -3208,45 +3208,38 @@ pub(crate) mod tenant_throttling {
impl TimelineGet {
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
let per_tenant_label_values = &[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
];
TimelineGet {
count_accounted_start: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
count_accounted_finish: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
wait_time: {
GlobalAndPerTenantIntCounter {
global: WAIT_USECS.with_label_values(&[KIND]),
per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
per_tenant: WAIT_USECS_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
count_throttled: {
GlobalAndPerTenantIntCounter {
global: WAIT_COUNT.with_label_values(&[KIND]),
per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
per_tenant: WAIT_COUNT_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
}

View File

@@ -840,6 +840,36 @@ impl Timeline {
Ok(total_size * BLCKSZ as u64)
}
/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
/// for gc-compaction.
///
/// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
/// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
/// be kept only for a specific range of LSN.
///
/// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
/// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
/// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
/// determine which keys to retain/drop for gc-compaction.
///
/// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
/// to be retained for each of the branch LSNs.
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_gc_compaction_keyspace(
&self,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
let metadata_key_begin = Key::metadata_key_range().start;
let aux_v1_key = AUX_FILES_KEY;
let dense_keyspace = KeySpace {
ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
};
Ok((
dense_keyspace,
SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
))
}
///
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from

View File

@@ -1968,9 +1968,6 @@ impl Tenant {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
}
TenantState::Loading => {
*current_state = TenantState::Activating(ActivatingFrom::Loading);
}
TenantState::Attaching => {
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
}
@@ -2151,7 +2148,7 @@ impl Tenant {
async fn set_stopping(
&self,
progress: completion::Barrier,
allow_transition_from_loading: bool,
_allow_transition_from_loading: bool,
allow_transition_from_attaching: bool,
) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
@@ -2166,7 +2163,6 @@ impl Tenant {
);
false
}
TenantState::Loading => allow_transition_from_loading,
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
})
.await
@@ -2185,13 +2181,6 @@ impl Tenant {
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Loading => {
if !allow_transition_from_loading {
unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Active => {
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
// are created after the transition to Stopping. That's harmless, as the Timelines
@@ -2247,7 +2236,7 @@ impl Tenant {
// The load & attach routines own the tenant state until it has reached `Active`.
// So, wait until it's done.
rx.wait_for(|state| match state {
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
TenantState::Activating(_) | TenantState::Attaching => {
info!(
"waiting for {} to turn Active|Broken|Stopping",
<&'static str>::from(state)
@@ -2267,7 +2256,7 @@ impl Tenant {
let reason = reason.to_string();
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
TenantState::Activating(_) | TenantState::Attaching => {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Active => {
@@ -2311,7 +2300,7 @@ impl Tenant {
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
self.activate_now();
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
@@ -3627,7 +3616,7 @@ impl Tenant {
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<UninitializedTimeline> {
) -> anyhow::Result<UninitializedTimeline<'a>> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
@@ -4144,7 +4133,7 @@ pub(crate) mod harness {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let tenant = Arc::new(Tenant::new(
TenantState::Loading,
TenantState::Attaching,
self.conf,
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt::from(self.tenant_conf.clone()),

View File

@@ -1,13 +1,13 @@
//! Common traits and structs for layers
pub mod delta_layer;
pub mod filter_iterator;
pub mod image_layer;
pub mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};

View File

@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadCoalesceMode, VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
@@ -1021,13 +1021,30 @@ impl DeltaLayerInner {
continue;
}
};
let view = BufView::new_slice(&blobs_buf.buf);
for meta in blobs_buf.blobs.iter().rev() {
if Some(meta.meta.key) == ignore_key_with_err {
continue;
}
let blob_read = meta.read(&view).await;
let blob_read = match blob_read {
Ok(buf) => buf,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::Other(anyhow!(e).context(format!(
"Failed to decompress blob from virtual file {}",
self.file.path,
))),
);
ignore_key_with_err = Some(meta.meta.key);
continue;
}
};
let value = Value::des(&blob_read);
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
let value = match value {
Ok(v) => v,
Err(e) => {
@@ -1243,21 +1260,21 @@ impl DeltaLayerInner {
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf, ctx).await?;
let view = BufView::new_slice(&res.buf);
for blob in res.blobs {
let key = blob.meta.key;
let lsn = blob.meta.lsn;
let data = &res.buf[blob.start..blob.end];
let data = blob.read(&view).await?;
#[cfg(debug_assertions)]
Value::des(data)
Value::des(&data)
.with_context(|| {
format!(
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
blob.meta.key,
blob.meta.lsn,
blob.start,
blob.end,
utils::Hex(data)
"blob failed to deserialize for {}: {:?}",
blob,
utils::Hex(&data)
)
})
.unwrap();
@@ -1265,15 +1282,15 @@ impl DeltaLayerInner {
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = crate::repository::ValueBytes::will_init(data)
let will_init = crate::repository::ValueBytes::will_init(&data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);
per_blob_copy.clear();
per_blob_copy.extend_from_slice(data);
per_blob_copy.extend_from_slice(&data);
let (tmp, res) = writer
.put_value_bytes(
@@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> {
.read_blobs(&plan, buf, self.ctx)
.await?;
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
let blob_read = meta.read(&view).await?;
let value = Value::des(&blob_read)?;
next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
}
self.key_values_batch = next_batch;
@@ -1916,9 +1936,13 @@ pub(crate) mod test {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
.await?;
let view = BufView::new_slice(&blobs_buf.buf);
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
let value = meta.read(&view).await?;
assert_eq!(
&value[..],
&entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
);
}
buf = Some(blobs_buf.buf);

View File

@@ -0,0 +1,205 @@
use std::ops::Range;
use anyhow::bail;
use pageserver_api::{
key::Key,
keyspace::{KeySpace, SparseKeySpace},
};
use utils::lsn::Lsn;
use crate::repository::Value;
use super::merge_iterator::MergeIterator;
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
///
/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
/// to be retained.
pub struct FilterIterator<'a> {
inner: MergeIterator<'a>,
retain_key_filters: Vec<Range<Key>>,
current_filter_idx: usize,
}
impl<'a> FilterIterator<'a> {
pub fn create(
inner: MergeIterator<'a>,
dense_keyspace: KeySpace,
sparse_keyspace: SparseKeySpace,
) -> anyhow::Result<Self> {
let mut retain_key_filters = Vec::new();
retain_key_filters.extend(dense_keyspace.ranges);
retain_key_filters.extend(sparse_keyspace.0.ranges);
retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
// Verify key filters are non-overlapping and sorted
for window in retain_key_filters.windows(2) {
if window[0].end > window[1].start {
bail!(
"Key filters are overlapping: {:?} and {:?}",
window[0],
window[1]
);
}
}
Ok(Self {
inner,
retain_key_filters,
current_filter_idx: 0,
})
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(item) = self.inner.next().await? {
while self.current_filter_idx < self.retain_key_filters.len()
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
{
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
self.current_filter_idx += 1;
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
}
if self.current_filter_idx >= self.retain_key_filters.len() {
// We already exhausted all filters, so we should return now
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter (nothing)
return Ok(None);
}
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
return Ok(Some(item));
}
// If the key is not contained in the key retaining filters, continue to the next item.
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
}
Ok(None)
}
}
#[cfg(test)]
mod tests {
use super::*;
use itertools::Itertools;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::produce_delta_layer,
},
DEFAULT_PG_VERSION,
};
async fn assert_filter_iter_equal(
filter_iter: &mut FilterIterator<'_>,
expect: &[(Key, Lsn, Value)],
) {
let mut expect_iter = expect.iter();
loop {
let o1 = filter_iter.next().await.unwrap();
let o2 = expect_iter.next();
assert_eq!(o1.is_some(), o2.is_some());
if o1.is_none() && o2.is_none() {
break;
}
let (k1, l1, v1) = o1.unwrap();
let (k2, l2, v2) = o2.unwrap();
assert_eq!(&k1, k2);
assert_eq!(l1, *l2);
assert_eq!(&v1, v2);
}
}
#[tokio::test]
async fn filter_keyspace_iterator() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
const N: usize = 100;
let test_deltas1 = (0..N)
.map(|idx| {
(
get_key(idx as u32),
Lsn(0x20 * ((idx as u64) % 10 + 1)),
Value::Image(Bytes::from(format!("img{idx:05}"))),
)
})
.collect_vec();
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let merge_iter = MergeIterator::create(
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
&[],
&ctx,
);
let mut filter_iter = FilterIterator::create(
merge_iter,
KeySpace {
ranges: vec![
get_key(5)..get_key(10),
get_key(20)..get_key(30),
get_key(90)..get_key(110),
get_key(1000)..get_key(2000),
],
},
SparseKeySpace(KeySpace::default()),
)
.unwrap();
let mut result = Vec::new();
result.extend(test_deltas1[5..10].iter().cloned());
result.extend(test_deltas1[20..30].iter().cloned());
result.extend(test_deltas1[90..100].iter().cloned());
assert_filter_iter_equal(&mut filter_iter, &result).await;
let merge_iter = MergeIterator::create(
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
&[],
&ctx,
);
let mut filter_iter = FilterIterator::create(
merge_iter,
KeySpace {
ranges: vec![
get_key(0)..get_key(10),
get_key(20)..get_key(30),
get_key(90)..get_key(95),
],
},
SparseKeySpace(KeySpace::default()),
)
.unwrap();
let mut result = Vec::new();
result.extend(test_deltas1[0..10].iter().cloned());
result.extend(test_deltas1[20..30].iter().cloned());
result.extend(test_deltas1[90..95].iter().cloned());
assert_filter_iter_equal(&mut filter_iter, &result).await;
}
}

View File

@@ -36,7 +36,8 @@ use crate::tenant::disk_btree::{
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
@@ -547,15 +548,15 @@ impl ImageLayerInner {
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
let img_buf = meta.read(&view).await?;
key_count += 1;
writer
.put_image(meta.meta.key, img_buf, ctx)
.put_image(meta.meta.key, img_buf.into_bytes(), ctx)
.await
.context(format!("Storing key {}", meta.meta.key))?;
}
@@ -602,13 +603,28 @@ impl ImageLayerInner {
match res {
Ok(blobs_buf) => {
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
let img_buf = meta.read(&view).await;
let img_buf = match img_buf {
Ok(img_buf) => img_buf,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::Other(anyhow!(e).context(format!(
"Failed to decompress blob from virtual file {}",
self.file.path,
))),
);
continue;
}
};
reconstruct_state.update_key(
&meta.meta.key,
self.lsn,
Value::Image(img_buf),
Value::Image(img_buf.into_bytes()),
);
}
}
@@ -1025,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> {
let blobs_buf = vectored_blob_reader
.read_blobs(&plan, buf, self.ctx)
.await?;
let frozen_buf: Bytes = blobs_buf.buf.freeze();
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
let img_buf = meta.read(&view).await?;
next_batch.push_back((
meta.meta.key,
self.image_layer.lsn,
Value::Image(img_buf.into_bytes()),
));
}
self.key_values_batch = next_batch;
Ok(())

View File

@@ -481,8 +481,7 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,

View File

@@ -31,6 +31,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
use crate::page_cache;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
@@ -1772,6 +1773,7 @@ impl Timeline {
gc_cutoff,
lowest_retain_lsn
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
@@ -1820,7 +1822,12 @@ impl Timeline {
image_layers.push(layer);
}
}
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
let mut merge_iter = FilterIterator::create(
MergeIterator::create(&delta_layers, &image_layers, ctx),
dense_ks,
sparse_ks,
)?;
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();

View File

@@ -30,8 +30,8 @@ use crate::{
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
},
};
@@ -557,6 +557,8 @@ impl Timeline {
gather_result = gather => {
match gather_result {
Ok(_) => {},
// It can happen sometimes that we hit this instead of the cancellation token firing above
Err(CalculateSyntheticSizeError::Cancelled) => {}
Err(e) => {
// We don't care about the result, but, if it failed, we should log it,
// since consumption metric might be hitting the cached value and

View File

@@ -16,8 +16,9 @@
//! Note that the vectored blob api does *not* go through the page cache.
use std::collections::BTreeMap;
use std::ops::Deref;
use bytes::BytesMut;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::Key;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::BoundedBuf;
@@ -35,11 +36,123 @@ pub struct BlobMeta {
pub lsn: Lsn,
}
/// Blob offsets into [`VectoredBlobsBuf::buf`]
/// A view into the vectored blobs read buffer.
#[derive(Clone, Debug)]
pub(crate) enum BufView<'a> {
Slice(&'a [u8]),
Bytes(bytes::Bytes),
}
impl<'a> BufView<'a> {
/// Creates a new slice-based view on the blob.
pub fn new_slice(slice: &'a [u8]) -> Self {
Self::Slice(slice)
}
/// Creates a new [`bytes::Bytes`]-based view on the blob.
pub fn new_bytes(bytes: bytes::Bytes) -> Self {
Self::Bytes(bytes)
}
/// Convert the view into `Bytes`.
///
/// If using slice as the underlying storage, the copy will be an O(n) operation.
pub fn into_bytes(self) -> Bytes {
match self {
BufView::Slice(slice) => Bytes::copy_from_slice(slice),
BufView::Bytes(bytes) => bytes,
}
}
/// Creates a sub-view of the blob based on the range.
fn view(&self, range: std::ops::Range<usize>) -> Self {
match self {
BufView::Slice(slice) => BufView::Slice(&slice[range]),
BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
}
}
}
impl<'a> Deref for BufView<'a> {
type Target = [u8];
fn deref(&self) -> &Self::Target {
match self {
BufView::Slice(slice) => slice,
BufView::Bytes(bytes) => bytes,
}
}
}
impl<'a> AsRef<[u8]> for BufView<'a> {
fn as_ref(&self) -> &[u8] {
match self {
BufView::Slice(slice) => slice,
BufView::Bytes(bytes) => bytes.as_ref(),
}
}
}
impl<'a> From<&'a [u8]> for BufView<'a> {
fn from(value: &'a [u8]) -> Self {
Self::new_slice(value)
}
}
impl From<Bytes> for BufView<'_> {
fn from(value: Bytes) -> Self {
Self::new_bytes(value)
}
}
/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte range is potentially compressed,
/// subject to [`VectoredBlob::compression_bits`].
pub struct VectoredBlob {
pub start: usize,
pub end: usize,
/// Blob metadata.
pub meta: BlobMeta,
/// Start offset.
start: usize,
/// End offset.
end: usize,
/// Compression used on the blob.
compression_bits: u8,
}
impl VectoredBlob {
/// Reads a decompressed view of the blob.
pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
let view = buf.view(self.start..self.end);
match self.compression_bits {
BYTE_UNCOMPRESSED => Ok(view),
BYTE_ZSTD => {
let mut decompressed_vec = Vec::new();
let mut decoder =
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
decoder.write_all(&view).await?;
decoder.flush().await?;
// Zero-copy conversion from `Vec` to `Bytes`
Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
}
bits => {
let error = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
);
Err(error)
}
}
}
}
impl std::fmt::Display for VectoredBlob {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}@{}, {}..{}",
self.meta.key, self.meta.lsn, self.start, self.end
)
}
}
/// Return type of [`VectoredBlobReader::read_blobs`]
@@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> {
);
}
let mut buf = self
let buf = self
.file
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
.await?
@@ -529,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> {
// of a blob is implicit: the start of the next blob if one exists
// or the end of the read.
// Some scratch space, put here for reusing the allocation
let mut decompressed_vec = Vec::new();
for (blob_start, meta) in blobs_at {
let blob_start_in_buf = blob_start - start_offset;
let first_len_byte = buf[blob_start_in_buf as usize];
@@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> {
)
};
let start_raw = blob_start_in_buf + size_length;
let end_raw = start_raw + blob_size;
let (start, end);
if compression_bits == BYTE_UNCOMPRESSED {
start = start_raw as usize;
end = end_raw as usize;
} else if compression_bits == BYTE_ZSTD {
let mut decoder =
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
decoder
.write_all(&buf[start_raw as usize..end_raw as usize])
.await?;
decoder.flush().await?;
start = buf.len();
buf.extend_from_slice(&decompressed_vec);
end = buf.len();
decompressed_vec.clear();
} else {
let error = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("invalid compression byte {compression_bits:x}"),
);
return Err(error);
}
let start = (blob_start_in_buf + size_length) as usize;
let end = start + blob_size as usize;
metas.push(VectoredBlob {
start,
end,
meta: *meta,
compression_bits,
});
}
@@ -1020,8 +1109,13 @@ mod tests {
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
assert_eq!(result.blobs.len(), 1);
let read_blob = &result.blobs[0];
let read_buf = &result.buf[read_blob.start..read_blob.end];
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
let view = BufView::new_slice(&result.buf);
let read_buf = read_blob.read(&view).await?;
assert_eq!(
&blob[..],
&read_buf[..],
"mismatch for idx={idx} at offset={offset}"
);
buf = result.buf;
}
Ok(())

View File

@@ -205,6 +205,22 @@ impl PostgresRedoManager {
}
}
/// Do a ping request-response roundtrip.
///
/// Not used in production, but by Rust benchmarks.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
self.do_with_walredo_process(pg_version, |proc| async move {
proc.ping(Duration::from_secs(1))
.await
.map_err(Error::Other)
})
.await
}
pub fn status(&self) -> WalRedoManagerStatus {
WalRedoManagerStatus {
last_redo_at: {
@@ -297,6 +313,9 @@ impl PostgresRedoManager {
}
}
/// # Cancel-Safety
///
/// This method is cancel-safe iff `closure` is cancel-safe.
async fn do_with_walredo_process<
F: FnOnce(Arc<Process>) -> Fut,
Fut: Future<Output = Result<O, Error>>,
@@ -537,6 +556,17 @@ mod tests {
use tracing::Instrument;
use utils::{id::TenantId, lsn::Lsn};
#[tokio::test]
async fn test_ping() {
let h = RedoHarness::new().unwrap();
h.manager
.ping(14)
.instrument(h.span())
.await
.expect("ping should work");
}
#[tokio::test]
async fn short_v14_redo() {
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();

View File

@@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
page_cache::PAGE_SZ,
span::debug_assert_current_span_has_tenant_id,
walrecord::NeonWalRecord,
};
@@ -237,6 +238,26 @@ impl WalRedoProcess {
res
}
/// Do a ping request-response roundtrip.
///
/// Not used in production, but by Rust benchmarks.
pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
let mut writebuf: Vec<u8> = Vec::with_capacity(4);
protocol::build_ping_msg(&mut writebuf);
let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo ping timed out");
};
let response = res?;
if response.len() != PAGE_SZ {
anyhow::bail!(
"WAL redo ping response should respond with page-sized response: {}",
response.len()
);
}
Ok(())
}
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another

View File

@@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
tag.ser_into(buf)
.expect("serialize BufferTag should always succeed");
}
pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
buf.put_u8(b'H');
buf.put_u32(4);
}

View File

@@ -9,6 +9,8 @@ OBJS = \
hll.o \
libpagestore.o \
neon.o \
neon_pgversioncompat.o \
neon_perf_counters.o \
neon_utils.o \
neon_walreader.o \
pagestore_smgr.o \
@@ -23,7 +25,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -109,6 +109,7 @@ typedef struct FileCacheControl
* reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
uint32 used_pages; /* number of used pages */
uint32 limit; /* shared copy of lfc_size_limit */
uint64 hits;
uint64 misses;
@@ -905,6 +906,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
}
CriticalAssert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -959,6 +964,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
for (int i = 0; i < blocks_in_chunk; i++)
{
lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
entry->bitmap[(chunk_offs + i) >> 5] |=
(1 << ((chunk_offs + i) & 31));
}
@@ -1051,6 +1057,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
if (lfc_ctl)
value = lfc_ctl->size;
break;
case 5:
key = "file_cache_used_pages";
if (lfc_ctl)
value = lfc_ctl->used_pages;
break;
default:
SRF_RETURN_DONE(funcctx);
}
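The used_pages bookkeeping above hinges on the chunk bitmap arithmetic: bit i of a chunk lives in 32-bit word i >> 5 at bit position i & 31, so eviction subtracts every set bit of the victim chunk and a write bumps the counter only when a bit flips from 0 to 1. A minimal Rust sketch of the same arithmetic follows; it is illustrative only, and BLOCKS_PER_CHUNK is an assumed value, not taken from the extension.

const BLOCKS_PER_CHUNK: usize = 128; // assumed chunk size for the sketch

struct Chunk {
    bitmap: [u32; BLOCKS_PER_CHUNK / 32],
}

// On eviction, forget every page the victim chunk was holding.
fn on_evict(victim: &Chunk, used_pages: &mut u64) {
    for i in 0..BLOCKS_PER_CHUNK {
        *used_pages -= u64::from((victim.bitmap[i >> 5] >> (i & 31)) & 1);
    }
}

// On write, count the page only if its bit was previously clear.
fn on_write(chunk: &mut Chunk, page: usize, used_pages: &mut u64) {
    let was_set = (chunk.bitmap[page >> 5] >> (page & 31)) & 1;
    *used_pages += u64::from(1 - was_set);
    chunk.bitmap[page >> 5] |= 1 << (page & 31);
}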

View File

@@ -30,6 +30,7 @@
#include "utils/guc.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "neon_utils.h"
#include "pagestore_client.h"
#include "walproposer.h"
@@ -331,6 +332,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
}
if (shard->conn)
{
MyNeonCounters->pageserver_disconnects_total++;
PQfinish(shard->conn);
shard->conn = NULL;
}
@@ -737,6 +739,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn;
MyNeonCounters->pageserver_requests_sent_total++;
/* If the connection was lost for some reason, reconnect */
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
{
@@ -889,6 +893,7 @@ pageserver_flush(shardno_t shard_no)
}
else
{
MyNeonCounters->pageserver_send_flushes_total++;
if (PQflush(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -922,7 +927,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
static Size
PagestoreShmemSize(void)
{
return sizeof(PagestoreShmemState);
return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
}
static bool
@@ -941,6 +946,9 @@ PagestoreShmemInit(void)
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
AssignPageserverConnstring(page_server_connstring, NULL);
}
NeonPerfCountersShmemInit();
LWLockRelease(AddinShmemInitLock);
return found;
}

View File

@@ -0,0 +1,39 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
CREATE FUNCTION get_backend_perf_counters()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
LANGUAGE C PARALLEL SAFE;
CREATE FUNCTION get_perf_counters()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
LANGUAGE C PARALLEL SAFE;
-- Show various metrics, for each backend. Note that the values are not reset
-- when a backend exits. When a new backend starts with the same backend ID, it will
-- continue accumulating the values from where the old backend left off. If you are
-- only interested in the changes from your own session, store the values at the
-- beginning of the session somewhere, and subtract them on subsequent calls.
--
-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
CREATE VIEW neon_backend_perf_counters AS
SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
FROM get_backend_perf_counters() AS P (
procno integer,
pid integer,
metric text,
bucket_le float8,
value float8
);
-- Summary across all backends. (This could also be implemented with
-- an aggregate query over neon_backend_perf_counters view.)
CREATE VIEW neon_perf_counters AS
SELECT P.metric, P.bucket_le, P.value
FROM get_perf_counters() AS P (
metric text,
bucket_le float8,
value float8
);

View File

@@ -0,0 +1,4 @@
DROP VIEW IF EXISTS neon_perf_counters;
DROP VIEW IF EXISTS neon_backend_perf_counters;
DROP FUNCTION IF EXISTS get_perf_counters();
DROP FUNCTION IF EXISTS get_backend_perf_counters();

View File

@@ -1,5 +1,7 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
# TODO: bump default version to 1.5, after we are certain that we don't
# need to roll back the compute image
default_version = '1.4'
module_pathname = '$libdir/neon'
relocatable = true

View File

@@ -0,0 +1,261 @@
/*-------------------------------------------------------------------------
*
* neon_perf_counters.c
* Collect statistics about Neon I/O
*
* Each backend has its own set of counters in shared memory.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/proc.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "neon_perf_counters.h"
#include "neon_pgversioncompat.h"
neon_per_backend_counters *neon_per_backend_counters_shared;
Size
NeonPerfCountersShmemSize(void)
{
Size size = 0;
size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));
return size;
}
void
NeonPerfCountersShmemInit(void)
{
bool found;
neon_per_backend_counters_shared =
ShmemInitStruct("Neon perf counters",
mul_size(MaxBackends,
sizeof(neon_per_backend_counters)),
&found);
Assert(found == IsUnderPostmaster);
if (!found)
{
/* shared memory is initialized to zeros, so nothing to do here */
}
}
/*
* Count a GetPage wait operation.
*/
void
inc_getpage_wait(uint64 latency_us)
{
int lo = 0;
int hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
/* Find the right bucket with binary search */
while (lo < hi)
{
int mid = (lo + hi) / 2;
if (latency_us < getpage_wait_bucket_thresholds[mid])
hi = mid;
else
lo = mid + 1;
}
MyNeonCounters->getpage_wait_us_bucket[lo]++;
MyNeonCounters->getpage_wait_us_sum += latency_us;
MyNeonCounters->getpage_wait_us_count++;
}
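The binary search above finds the first threshold that the observed latency is strictly below, so a 250 us wait lands in the 300 us bucket and anything past the last finite threshold falls into the UINT64_MAX catch-all. The same lookup can be written as a one-liner in Rust with partition_point; this is an illustrative sketch, not the extension's code.

// Returns the index of the first threshold greater than latency_us,
// i.e. the histogram bucket the observation belongs to.
fn bucket_index(thresholds: &[u64], latency_us: u64) -> usize {
    thresholds.partition_point(|&t| t <= latency_us)
}

fn main() {
    let thresholds = [20u64, 30, 60, 100, 200, 300, u64::MAX];
    assert_eq!(bucket_index(&thresholds, 10), 0); // below the first threshold
    assert_eq!(bucket_index(&thresholds, 250), 5); // the "up to 300 us" bucket
    assert_eq!(bucket_index(&thresholds, 7_000_000), 6); // catch-all bucket
}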
/*
* Support functions for the views, neon_backend_perf_counters and
* neon_perf_counters.
*/
typedef struct
{
char *name;
bool is_bucket;
double bucket_le;
double value;
} metric_t;
static metric_t *
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
{
#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8)
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
uint64 bucket_accum;
int i = 0;
Datum getpage_wait_str;
metrics[i].name = "getpage_wait_seconds_count";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->getpage_wait_us_count;
i++;
metrics[i].name = "getpage_wait_seconds_sum";
metrics[i].is_bucket = false;
metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0;
i++;
bucket_accum = 0;
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
{
uint64 threshold = getpage_wait_bucket_thresholds[bucketno];
bucket_accum += counters->getpage_wait_us_bucket[bucketno];
metrics[i].name = "getpage_wait_seconds_bucket";
metrics[i].is_bucket = true;
metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
metrics[i].value = (double) bucket_accum;
i++;
}
metrics[i].name = "getpage_prefetch_requests_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->getpage_prefetch_requests_total;
i++;
metrics[i].name = "getpage_sync_requests_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->getpage_sync_requests_total;
i++;
metrics[i].name = "getpage_prefetch_misses_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->getpage_prefetch_misses_total;
i++;
metrics[i].name = "getpage_prefetch_discards_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->getpage_prefetch_discards_total;
i++;
metrics[i].name = "pageserver_requests_sent_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->pageserver_requests_sent_total;
i++;
metrics[i].name = "pageserver_requests_disconnects_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->pageserver_disconnects_total;
i++;
metrics[i].name = "pageserver_send_flushes_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->pageserver_send_flushes_total;
i++;
metrics[i].name = "file_cache_hits_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->file_cache_hits_total;
i++;
Assert(i == NUM_METRICS);
/* NULL entry marks end of array */
metrics[i].name = NULL;
metrics[i].value = 0;
return metrics;
}
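The bucket_accum walk above follows the Prometheus histogram convention: each exported getpage_wait_seconds_bucket value is cumulative, counting all observations up to its bucket_le bound (converted from microseconds to seconds), and the final +Inf bucket equals the total count. A compact Rust restatement of that conversion, as an illustrative sketch:

fn cumulative_buckets(thresholds_us: &[u64], counts: &[u64]) -> Vec<(f64, u64)> {
    let mut acc = 0u64;
    thresholds_us
        .iter()
        .zip(counts)
        .map(|(&t, &c)| {
            acc += c; // running total: exported bucket values are cumulative
            let le = if t == u64::MAX { f64::INFINITY } else { t as f64 / 1_000_000.0 };
            (le, acc)
        })
        .collect()
}

fn main() {
    // Three buckets (<= 100 us, <= 1 ms, +Inf) with 2, 3 and 1 raw observations.
    let buckets = cumulative_buckets(&[100, 1_000, u64::MAX], &[2, 3, 1]);
    let cumulative: Vec<u64> = buckets.iter().map(|b| b.1).collect();
    assert_eq!(cumulative, vec![2, 5, 6]); // the +Inf bucket equals the total count
    assert!(buckets[2].0.is_infinite());
}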
/*
* Write metric to three output Datums
*/
static void
metric_to_datums(metric_t *m, Datum *values, bool *nulls)
{
values[0] = CStringGetTextDatum(m->name);
nulls[0] = false;
if (m->is_bucket)
{
values[1] = Float8GetDatum(m->bucket_le);
nulls[1] = false;
}
else
{
values[1] = (Datum) 0;
nulls[1] = true;
}
values[2] = Float8GetDatum(m->value);
nulls[2] = false;
}
PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters);
Datum
neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
Datum values[5];
bool nulls[5];
/* We put all the tuples into a tuplestore in one go. */
InitMaterializedSRF(fcinfo, 0);
for (int procno = 0; procno < MaxBackends; procno++)
{
PGPROC *proc = GetPGProcByNumber(procno);
int pid = proc->pid;
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
metric_t *metrics = neon_perf_counters_to_metrics(counters);
values[0] = Int32GetDatum(procno);
nulls[0] = false;
values[1] = Int32GetDatum(pid);
nulls[1] = false;
for (int i = 0; metrics[i].name != NULL; i++)
{
metric_to_datums(&metrics[i], &values[2], &nulls[2]);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
}
pfree(metrics);
}
return (Datum) 0;
}
PG_FUNCTION_INFO_V1(neon_get_perf_counters);
Datum
neon_get_perf_counters(PG_FUNCTION_ARGS)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
Datum values[3];
bool nulls[3];
Datum getpage_wait_str;
neon_per_backend_counters totals = {0};
metric_t *metrics;
/* We put all the tuples into a tuplestore in one go. */
InitMaterializedSRF(fcinfo, 0);
/* Aggregate the counters across all backends */
for (int procno = 0; procno < MaxBackends; procno++)
{
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
totals.getpage_wait_us_count += counters->getpage_wait_us_count;
totals.getpage_wait_us_sum += counters->getpage_wait_us_sum;
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno];
totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total;
totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total;
totals.pageserver_disconnects_total += counters->pageserver_disconnects_total;
totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total;
totals.file_cache_hits_total += counters->file_cache_hits_total;
}
metrics = neon_perf_counters_to_metrics(&totals);
for (int i = 0; metrics[i].name != NULL; i++)
{
metric_to_datums(&metrics[i], &values[0], &nulls[0]);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
}
pfree(metrics);
return (Datum) 0;
}

View File

@@ -0,0 +1,111 @@
/*-------------------------------------------------------------------------
*
* neon_perf_counters.h
* Performance counters for neon storage requests
*-------------------------------------------------------------------------
*/
#ifndef NEON_PERF_COUNTERS_H
#define NEON_PERF_COUNTERS_H
#if PG_VERSION_NUM >= 170000
#include "storage/procnumber.h"
#else
#include "storage/backendid.h"
#include "storage/proc.h"
#endif
static const uint64 getpage_wait_bucket_thresholds[] = {
20, 30, 60, 100, /* 0 - 100 us */
200, 300, 600, 1000, /* 100 us - 1 ms */
2000, 3000, 6000, 10000, /* 1 ms - 10 ms */
20000, 30000, 60000, 100000, /* 10 ms - 100 ms */
200000, 300000, 600000, 1000000, /* 100 ms - 1 s */
2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */
20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */
UINT64_MAX,
};
#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))
typedef struct
{
/*
* Histogram of how long an smgrread() request needs to wait for a response
* from the pageserver. When prefetching is effective, these wait times can be
* lower than the network latency to the pageserver, even zero, if the
* page has already been prefetched by the time we need to read it.
*
* Note: we accumulate these in microseconds, because that's convenient in
* the backend, but the 'neon_backend_perf_counters' view will convert
* them to seconds, to make them more idiomatic as prometheus metrics.
*/
uint64 getpage_wait_us_count;
uint64 getpage_wait_us_sum;
uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];
/*
* Total number of speculative prefetch Getpage requests and synchronous
* GetPage requests sent.
*/
uint64 getpage_prefetch_requests_total;
uint64 getpage_sync_requests_total;
/* XXX: It's not clear to me when these misses happen. */
uint64 getpage_prefetch_misses_total;
/*
* Number of prefetched responses that were discarded because the
* prefetched page was not needed or because it was concurrently fetched /
* modified by another backend.
*/
uint64 getpage_prefetch_discards_total;
/*
* Total number of requests sent to the pageserver. (prefetch_requests_total
* and sync_requests_total count only GetPage requests, whereas this counts all
* request types.)
*/
uint64 pageserver_requests_sent_total;
/*
* Number of times the connection to the pageserver was lost and the
* backend had to reconnect. Note that this doesn't count the first
* connection in each backend, only reconnects.
*/
uint64 pageserver_disconnects_total;
/*
* Number of network flushes to the pageserver. Synchronous requests are
* flushed immediately, but prefetch requests are sent in batches, so
* this can be smaller than pageserver_requests_sent_total.
*/
uint64 pageserver_send_flushes_total;
/*
* Number of requests satisfied from the LFC.
*
* This is redundant with the server-wide file_cache_hits, but this gives
* per-backend granularity, and it's handy to have this in the same place
* as counters for requests that went to the pageserver. Maybe move all
* the LFC stats to this struct in the future?
*/
uint64 file_cache_hits_total;
} neon_per_backend_counters;
/* Pointer to the shared memory array of neon_per_backend_counters structs */
extern neon_per_backend_counters *neon_per_backend_counters_shared;
#if PG_VERSION_NUM >= 170000
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
#else
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProc->pgprocno])
#endif
extern void inc_getpage_wait(uint64 latency);
extern Size NeonPerfCountersShmemSize(void);
extern void NeonPerfCountersShmemInit(void);
#endif /* NEON_PERF_COUNTERS_H */
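To make the MyNeonCounters indexing concrete: there is one fixed-size counters struct per backend in a single shared array, and the idea is that each backend only ever writes its own slot while readers (such as the SQL views) presumably tolerate slightly stale values. A rough Rust sketch of that layout, purely illustrative:

#[derive(Default, Clone, Copy)]
struct Counters {
    getpage_wait_us_count: u64,
    getpage_wait_us_sum: u64,
    file_cache_hits_total: u64,
    // ... remaining counters elided in this sketch
}

struct SharedCounters {
    per_backend: Vec<Counters>, // MaxBackends slots in the real shared segment
}

impl SharedCounters {
    fn new(max_backends: usize) -> Self {
        Self { per_backend: vec![Counters::default(); max_backends] }
    }

    // Analogue of the MyNeonCounters macro: index by this backend's number.
    fn mine(&mut self, my_proc_number: usize) -> &mut Counters {
        &mut self.per_backend[my_proc_number]
    }
}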

View File

@@ -0,0 +1,44 @@
/*
* Support functions for the compatibility macros in neon_pgversioncompat.h
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "utils/tuplestore.h"
#include "neon_pgversioncompat.h"
#if PG_MAJORVERSION_NUM < 15
void
InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
Tuplestorestate *tupstore;
MemoryContext old_context,
per_query_ctx;
TupleDesc stored_tupdesc;
/* check to see if caller supports returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
/*
* Store the tuplestore and the tuple descriptor in ReturnSetInfo. This
* must be done in the per-query memory context.
*/
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
old_context = MemoryContextSwitchTo(per_query_ctx);
if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupstore = tuplestore_begin_heap(false, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = stored_tupdesc;
MemoryContextSwitchTo(old_context);
}
#endif

View File

@@ -6,6 +6,8 @@
#ifndef NEON_PGVERSIONCOMPAT_H
#define NEON_PGVERSIONCOMPAT_H
#include "fmgr.h"
#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -123,4 +125,8 @@
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
#endif
#endif /* NEON_PGVERSIONCOMPAT_H */

View File

@@ -66,6 +66,7 @@
#include "storage/md.h"
#include "storage/smgr.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"
#include "bitmap.h"
@@ -289,7 +290,6 @@ static PrefetchState *MyPState;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
static bool prefetch_wait_for(uint64 ring_index);
@@ -780,21 +780,27 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
}
/*
* prefetch_register_buffer() - register and prefetch buffer
* prefetch_register_bufferv() - register and prefetch buffers
*
* Register that we may want the contents of BufferTag in the near future.
* This is used when issuing a speculative prefetch request, but also when
* performing a synchronous request and need the buffer right now.
*
* If force_request_lsns is not NULL, those values are sent to the
* pageserver. If NULL, we use the lastWrittenLsn infrastructure
* to calculate the LSNs to send.
*
* When performing a prefetch rather than a synchronous request, pass
* is_prefetch==true. Currently, it only affects how the request is accounted
* in the perf counters.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
BlockNumber nblocks, const bits8 *mask)
BlockNumber nblocks, const bits8 *mask,
bool is_prefetch)
{
uint64 min_ring_index;
PrefetchRequest req;
@@ -815,6 +821,7 @@ Retry:
PrfHashEntry *entry = NULL;
uint64 ring_index;
neon_request_lsns *lsns;
if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
continue;
@@ -858,6 +865,7 @@ Retry:
prefetch_set_unused(ring_index);
entry = NULL;
slot = NULL;
MyNeonCounters->getpage_prefetch_discards_total++;
}
}
@@ -972,6 +980,11 @@ Retry:
min_ring_index = Min(min_ring_index, ring_index);
if (is_prefetch)
MyNeonCounters->getpage_prefetch_requests_total++;
else
MyNeonCounters->getpage_sync_requests_total++;
prefetch_do_request(slot, lsns);
}
@@ -1000,13 +1013,6 @@ Retry:
}
static uint64
prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
{
return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
}
/*
* Note: this function can get canceled and use a long jump to the next catch
* context. Take care.
@@ -2612,7 +2618,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
lfc_present[i] = ~(lfc_present[i]);
ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
lfc_present);
lfc_present, true);
nblocks -= iterblocks;
blocknum += iterblocks;
@@ -2656,7 +2662,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
ring_index = prefetch_register_buffer(tag, NULL);
ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
@@ -2747,17 +2753,20 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
* weren't for the behaviour of the LwLsn cache that uses the highest
* value of the LwLsn cache when the entry is not found.
*/
prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);
prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);
for (int i = 0; i < nblocks; i++)
{
void *buffer = buffers[i];
BlockNumber blockno = base_blockno + i;
neon_request_lsns *reqlsns = &request_lsns[i];
TimestampTz start_ts, end_ts;
if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
continue;
start_ts = GetCurrentTimestamp();
if (RecoveryInProgress() && MyBackendType != B_STARTUP)
XLogWaitForReplayOf(reqlsns[0].request_lsn);
@@ -2794,6 +2803,7 @@ Retry:
/* drop caches */
prefetch_set_unused(slot->my_ring_index);
pgBufferUsage.prefetch.expired += 1;
MyNeonCounters->getpage_prefetch_discards_total++;
/* make it look like a prefetch cache miss */
entry = NULL;
}
@@ -2804,8 +2814,9 @@ Retry:
if (entry == NULL)
{
pgBufferUsage.prefetch.misses += 1;
MyNeonCounters->getpage_prefetch_misses_total++;
ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
Assert(ring_index != UINT64_MAX);
slot = GetPrfSlot(ring_index);
}
@@ -2860,6 +2871,9 @@ Retry:
/* buffer was used, clean up for later reuse */
prefetch_set_unused(ring_index);
prefetch_cleanup_trailing_unused();
end_ts = GetCurrentTimestamp();
inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
}
}
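The timing added around the synchronous read above measures the whole wait in microseconds (PostgreSQL's TimestampTz is a microsecond count, which is what inc_getpage_wait() expects) and clamps a negative delta to zero because the wall clock is not monotonic. A hedged Rust sketch of the same shape, deliberately using SystemTime to mirror the non-monotonic clock source:

use std::time::SystemTime;

// Run `wait`, then report how long it took in whole microseconds, treating a
// backwards-moving clock as a zero-length wait.
fn observe_wait<R>(wait: impl FnOnce() -> R, record_us: impl FnOnce(u64)) -> R {
    let start = SystemTime::now();
    let result = wait();
    let elapsed_us = SystemTime::now()
        .duration_since(start) // Err means the clock went backwards
        .map(|d| d.as_micros() as u64)
        .unwrap_or(0);
    record_us(elapsed_us);
    result
}

fn main() {
    let page = observe_wait(
        || vec![0u8; 8192],                     // stand-in for waiting on a GetPage response
        |us| println!("getpage wait: {us} us"), // stand-in for inc_getpage_wait()
    );
    assert_eq!(page.len(), 8192);
}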
@@ -2913,6 +2927,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
return;
}
@@ -3097,7 +3112,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
/* assume heap */
RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",

View File

@@ -24,6 +24,7 @@
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
* ApplyRecord ('A'): Apply a WAL record (in the payload)
* GetPage ('G'): Return a page image from buffer cache.
* Ping ('H'): Return the input message.
*
* Currently, you only get a response to GetPage requests; the response is
* simply a 8k page, without any headers. Errors are logged to stderr.
@@ -133,6 +134,7 @@ static void ApplyRecord(StringInfo input_message);
static void apply_error_callback(void *arg);
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
static void GetPage(StringInfo input_message);
static void Ping(StringInfo input_message);
static ssize_t buffered_read(void *buf, size_t count);
static void CreateFakeSharedMemoryAndSemaphores();
@@ -394,6 +396,10 @@ WalRedoMain(int argc, char *argv[])
GetPage(&input_message);
break;
case 'H': /* Ping */
Ping(&input_message);
break;
/*
* EOF means we're done. Perform normal shutdown.
*/
@@ -1057,6 +1063,36 @@ GetPage(StringInfo input_message)
}
static void
Ping(StringInfo input_message)
{
int tot_written;
/* Response: the input message */
tot_written = 0;
do {
ssize_t rc;
/* We don't need alignment, but it's bad practice to use char[BLCKSZ] */
#if PG_VERSION_NUM >= 160000
static const PGIOAlignedBlock response;
#else
static const PGAlignedBlock response;
#endif
rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written);
if (rc < 0) {
/* If interrupted by signal, just retry */
if (errno == EINTR)
continue;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to stdout: %m")));
}
tot_written += rc;
} while (tot_written < BLCKSZ);
elog(TRACE, "Page sent back for ping");
}
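The response loop above is the usual raw write() pattern: a single write to stdout may send fewer bytes than requested or be interrupted by a signal, so the remainder is written repeatedly until the full 8 kB page (zero-filled, since the static buffer is zero-initialized) has gone out. An equivalent illustrative sketch in Rust; std's write_all does the same job in practice.

use std::io::{self, Write};

const BLCKSZ: usize = 8192;

fn send_ping_response(out: &mut impl Write) -> io::Result<()> {
    let page = [0u8; BLCKSZ]; // the ping reply is just a zeroed page
    let mut written = 0;
    while written < BLCKSZ {
        match out.write(&page[written..]) {
            Ok(0) => return Err(io::ErrorKind::WriteZero.into()),
            Ok(n) => written += n,
            // EINTR surfaces as ErrorKind::Interrupted: just retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

fn main() -> io::Result<()> {
    send_ping_response(&mut io::stdout().lock())
}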
/* Buffer used by buffered_read() */
static char stdin_buf[16 * 1024];
static size_t stdin_len = 0; /* # of bytes in buffer */

View File

@@ -444,7 +444,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
Self::Web(url, ()) => {
info!("performing web authentication");
let info = web::authenticate(ctx, &url, client).await?;
let info = web::authenticate(ctx, config, &url, client).await?;
Backend::Web(url, info)
}

View File

@@ -1,5 +1,6 @@
use crate::{
auth, compute,
config::AuthenticationConfig,
console::{self, provider::NodeInfo},
context::RequestMonitoring,
error::{ReportableError, UserFacingError},
@@ -58,6 +59,7 @@ pub(crate) fn new_psql_session_id() -> String {
pub(super) async fn authenticate(
ctx: &RequestMonitoring,
auth_config: &'static AuthenticationConfig,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
@@ -89,6 +91,14 @@ pub(super) async fn authenticate(
info!(parent: &span, "waiting for console's reply...");
let db_info = waiter.await.map_err(WebAuthError::from)?;
if auth_config.ip_allowlist_check_enabled {
if let Some(allowed_ips) = &db_info.allowed_ips {
if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
}
}
}
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
// This config should be self-contained, because we won't

View File

@@ -284,6 +284,8 @@ pub(crate) struct DatabaseInfo {
/// be inconvenient for debug with local PG instance.
pub(crate) password: Option<Box<str>>,
pub(crate) aux: MetricsAuxInfo,
#[serde(default)]
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
}
// Manually implement debug to omit sensitive info.
@@ -294,6 +296,7 @@ impl fmt::Debug for DatabaseInfo {
.field("port", &self.port)
.field("dbname", &self.dbname)
.field("user", &self.user)
.field("allowed_ips", &self.allowed_ips)
.finish_non_exhaustive()
}
}
@@ -432,6 +435,22 @@ mod tests {
"aux": dummy_aux(),
}))?;
// with allowed_ips
let dbinfo = serde_json::from_value::<DatabaseInfo>(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"password": "password",
"aux": dummy_aux(),
"allowed_ips": ["127.0.0.1"],
}))?;
assert_eq!(
dbinfo.allowed_ips,
Some(vec![IpPattern::Single("127.0.0.1".parse()?)])
);
Ok(())
}
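The #[serde(default)] on allowed_ips is what keeps older control-plane responses deserializing: if the key is absent, the field becomes None instead of an error. A self-contained sketch of that behaviour, assuming the serde and serde_json crates; the String stand-in replaces the real IpPattern type.

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct DbInfoSketch {
    host: String,
    #[serde(default)]
    allowed_ips: Option<Vec<String>>, // IpPattern in the real struct
}

fn main() {
    let with: DbInfoSketch =
        serde_json::from_str(r#"{"host": "localhost", "allowed_ips": ["127.0.0.1"]}"#).unwrap();
    let without: DbInfoSketch = serde_json::from_str(r#"{"host": "localhost"}"#).unwrap();
    assert_eq!(with.host, "localhost");
    assert_eq!(with.allowed_ips.as_deref(), Some(&["127.0.0.1".to_string()][..]));
    assert!(without.allowed_ips.is_none()); // missing key falls back to None
}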

View File

@@ -71,6 +71,37 @@ impl ComputeHookTenant {
}
}
fn is_sharded(&self) -> bool {
matches!(self, ComputeHookTenant::Sharded(_))
}
/// Clear compute hook state for the specified shard.
/// Only valid for [`ComputeHookTenant::Sharded`] instances.
fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
match self {
ComputeHookTenant::Sharded(sharded) => {
if sharded.stripe_size != stripe_size
|| sharded.shard_count != tenant_shard_id.shard_count
{
tracing::warn!("Shard split detected while handling detach")
}
let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
*shard_number == tenant_shard_id.shard_number
});
if let Some(shard_idx) = shard_idx {
sharded.shards.remove(shard_idx);
} else {
tracing::warn!("Shard not found while handling detach")
}
}
ComputeHookTenant::Unsharded(_) => {
unreachable!("Detach of unsharded tenants is handled externally");
}
}
}
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
/// and drops existing content.
fn update(
@@ -614,6 +645,36 @@ impl ComputeHook {
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
.await
}
/// Reflect a detach for a particular shard in the compute hook state.
///
/// The goal is to avoid sending compute notifications with stale information (i.e.
/// including detached pageservers).
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) fn handle_detach(
&self,
tenant_shard_id: TenantShardId,
stripe_size: ShardStripeSize,
) {
use std::collections::hash_map::Entry;
let mut state_locked = self.state.lock().unwrap();
match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(_) => {
tracing::warn!("Compute hook tenant not found for detach");
}
Entry::Occupied(mut e) => {
let sharded = e.get().is_sharded();
if !sharded {
e.remove();
} else {
e.get_mut().remove_shard(tenant_shard_id, stripe_size);
}
tracing::debug!("Compute hook handled shard detach");
}
}
}
}
#[cfg(test)]

View File

@@ -1849,7 +1849,7 @@ pub fn make_router(
RequestName("v1_tenant_timeline"),
)
})
.post(
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
|r| {
tenant_service_handler(

View File

@@ -238,7 +238,7 @@ impl PageserverClient {
) -> Result<()> {
measured_request!(
"timeline_archival_config",
crate::metrics::Method::Post,
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.timeline_archival_config(tenant_shard_id, timeline_id, req)

View File

@@ -820,6 +820,16 @@ impl Reconciler {
self.location_config(&node, conf, None, false).await?;
}
// The condition below identifies a detach. We must have no attached intent and
// must have been attached to something previously. Pass this information to
// the [`ComputeHook`] such that it can update its tenant-wide state.
if self.intent.attached.is_none() && !self.detach.is_empty() {
// TODO: Consider notifying control plane about detaches. This would avoid situations
// where the compute tries to start up with a stale set of pageservers.
self.compute_hook
.handle_detach(self.tenant_shard_id, self.shard.stripe_size);
}
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
Ok(())

View File

@@ -2,7 +2,7 @@ use crate::{node::Node, tenant_shard::TenantShard};
use itertools::Itertools;
use pageserver_api::models::PageserverUtilization;
use serde::Serialize;
use std::collections::HashMap;
use std::{collections::HashMap, fmt::Debug};
use utils::{http::error::ApiError, id::NodeId};
/// Scenarios in which we cannot find a suitable location for a tenant shard
@@ -27,7 +27,7 @@ pub enum MaySchedule {
}
#[derive(Serialize)]
struct SchedulerNode {
pub(crate) struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
shard_count: usize,
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
@@ -38,6 +38,137 @@ struct SchedulerNode {
may_schedule: MaySchedule,
}
pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
fn generate(
node_id: &NodeId,
node: &mut SchedulerNode,
context: &ScheduleContext,
) -> Option<Self>;
fn is_overloaded(&self) -> bool;
fn node_id(&self) -> NodeId;
}
pub(crate) trait ShardTag {
type Score: NodeSchedulingScore;
}
pub(crate) struct AttachedShardTag {}
impl ShardTag for AttachedShardTag {
type Score = NodeAttachmentSchedulingScore;
}
pub(crate) struct SecondaryShardTag {}
impl ShardTag for SecondaryShardTag {
type Score = NodeSecondarySchedulingScore;
}
/// Scheduling score of a given node for shard attachments.
/// Lower scores indicate more suitable nodes.
/// Ordering is given by member declaration order (top to bottom).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NodeAttachmentSchedulingScore {
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
/// This normally tracks the number of attached shards belonging to the
/// tenant being scheduled that are already on this node.
attached_shards_in_context: usize,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
/// acts as an anti-affinity between attached shards.
total_attached_shard_count: usize,
/// Convenience to make selection deterministic in tests and empty systems
node_id: NodeId,
}
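The note "Ordering is given by member declaration order" leans on a Rust guarantee: #[derive(PartialOrd, Ord)] compares struct fields lexicographically, top to bottom, so the first field is the highest-precedence sort key and later fields only break ties. A standalone sketch of that property (illustrative, not controller code):

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
struct Score {
    affinity: u64,    // compared first
    utilization: u64, // only breaks affinity ties
    node_id: u64,     // final tie-breaker, keeps selection deterministic
}

fn main() {
    let mut scores = vec![
        Score { affinity: 1, utilization: 10, node_id: 2 },
        Score { affinity: 0, utilization: 99, node_id: 3 },
        Score { affinity: 0, utilization: 5, node_id: 1 },
    ];
    scores.sort();
    // Affinity outranks utilization: the affinity-1 entry sorts last even
    // though its utilization (10) is lower than 99.
    assert_eq!(scores[2], Score { affinity: 1, utilization: 10, node_id: 2 });
    // Utilization only decides between the two affinity-0 entries.
    assert_eq!(scores[0], Score { affinity: 0, utilization: 5, node_id: 1 });
}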
impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
fn generate(
node_id: &NodeId,
node: &mut SchedulerNode,
context: &ScheduleContext,
) -> Option<Self> {
let utilization = match &mut node.may_schedule {
MaySchedule::Yes(u) => u,
MaySchedule::No => {
return None;
}
};
Some(Self {
affinity_score: context
.nodes
.get(node_id)
.copied()
.unwrap_or(AffinityScore::FREE),
attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
utilization_score: utilization.cached_score(),
total_attached_shard_count: node.attached_shard_count,
node_id: *node_id,
})
}
fn is_overloaded(&self) -> bool {
PageserverUtilization::is_overloaded(self.utilization_score)
}
fn node_id(&self) -> NodeId {
self.node_id
}
}
/// Scheduling score of a given node for shard secondaries.
/// Lower scores indicate more suitable nodes.
/// Ordering is given by member declaration order (top to bottom).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NodeSecondarySchedulingScore {
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
/// acts as an anti-affinity between attached shards.
total_attached_shard_count: usize,
/// Convenience to make selection deterministic in tests and empty systems
node_id: NodeId,
}
impl NodeSchedulingScore for NodeSecondarySchedulingScore {
fn generate(
node_id: &NodeId,
node: &mut SchedulerNode,
context: &ScheduleContext,
) -> Option<Self> {
let utilization = match &mut node.may_schedule {
MaySchedule::Yes(u) => u,
MaySchedule::No => {
return None;
}
};
Some(Self {
affinity_score: context
.nodes
.get(node_id)
.copied()
.unwrap_or(AffinityScore::FREE),
utilization_score: utilization.cached_score(),
total_attached_shard_count: node.attached_shard_count,
node_id: *node_id,
})
}
fn is_overloaded(&self) -> bool {
PageserverUtilization::is_overloaded(self.utilization_score)
}
fn node_id(&self) -> NodeId {
self.node_id
}
}
impl PartialEq for SchedulerNode {
fn eq(&self, other: &Self) -> bool {
let may_schedule_matches = matches!(
@@ -406,6 +537,28 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
/// Compute a scheduling score for each node that the scheduler knows of,
/// minus a set of hard-excluded nodes.
fn compute_node_scores<Score>(
&mut self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Vec<Score>
where
Score: NodeSchedulingScore,
{
self.nodes
.iter_mut()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) {
None
} else {
Score::generate(k, v, context)
}
})
.collect()
}
/// hard_exclude: it is forbidden to use nodes in this list, typically because they
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
@@ -415,7 +568,7 @@ impl Scheduler {
/// to their anti-affinity score. We use this to prefer to avoid placing shards of
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
pub(crate) fn schedule_shard<Tag: ShardTag>(
&mut self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
@@ -424,20 +577,7 @@ impl Scheduler {
return Err(ScheduleError::NoPageservers);
}
let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
.nodes
.iter_mut()
.filter_map(|(k, v)| match &mut v.may_schedule {
MaySchedule::No => None,
MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
MaySchedule::Yes(utilization) => Some((
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
utilization.cached_score(),
v.attached_shard_count,
)),
})
.collect();
let mut scores = self.compute_node_scores::<Tag::Score>(hard_exclude, context);
// Exclude nodes whose utilization is critically high, if there are alternatives available. This will
// cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
@@ -445,20 +585,18 @@ impl Scheduler {
// overloaded.
let non_overloaded_scores = scores
.iter()
.filter(|i| !PageserverUtilization::is_overloaded(i.2))
.filter(|i| !i.is_overloaded())
.copied()
.collect::<Vec<_>>();
if !non_overloaded_scores.is_empty() {
scores = non_overloaded_scores;
}
// Sort by, in order of precedence:
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
// 2nd: Utilization score (this combines shard count and disk utilization)
// 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some
// empty nodes), this acts as an anti-affinity between attached shards.
// 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
// Sort the nodes by score. The one with the lowest scores will be the preferred node.
// Refer to [`NodeAttachmentSchedulingScore`] for attached locations and
// [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes
// are ranked.
scores.sort();
if scores.is_empty() {
// After applying constraints, no pageservers were left.
@@ -481,12 +619,12 @@ impl Scheduler {
}
// Lowest score wins
let node_id = scores.first().unwrap().0;
let node_id = scores.first().unwrap().node_id();
if !matches!(context.mode, ScheduleMode::Speculative) {
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
);
}
@@ -556,9 +694,9 @@ mod tests {
let context = ScheduleContext::default();
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
@@ -567,7 +705,8 @@ mod tests {
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
let scheduled =
scheduler.schedule_shard::<AttachedShardTag>(&t1_intent.all_pageservers(), &context)?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
@@ -621,7 +760,9 @@ mod tests {
scheduler: &mut Scheduler,
context: &ScheduleContext,
) {
let scheduled = scheduler.schedule_shard(&[], context).unwrap();
let scheduled = scheduler
.schedule_shard::<AttachedShardTag>(&[], context)
.unwrap();
let mut intent = IntentState::new();
intent.set_attached(scheduler, Some(scheduled));
scheduled_intents.push(intent);

View File

@@ -26,7 +26,7 @@ use crate::{
ShardGenerationState, TenantFilter,
},
reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
scheduler::{AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
tenant_shard::{
MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
ScheduleOptimizationAction,
@@ -2629,7 +2629,8 @@ impl Service {
let scheduler = &mut locked.scheduler;
// Right now we only perform the operation on a single node without parallelization
// TODO fan out the operation to multiple nodes for better performance
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
let node_id =
scheduler.schedule_shard::<AttachedShardTag>(&[], &ScheduleContext::default())?;
let node = locked
.nodes
.get(&node_id)
@@ -2815,7 +2816,8 @@ impl Service {
// Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
// was attached, just has to be able to see the S3 content)
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
let node_id =
scheduler.schedule_shard::<AttachedShardTag>(&[], &ScheduleContext::default())?;
let node = nodes
.get(&node_id)
.expect("Pageservers may not be deleted while lock is active");

View File

@@ -8,7 +8,10 @@ use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
reconciler::{ReconcileUnits, ReconcilerConfig},
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
scheduler::{
AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext,
SecondaryShardTag,
},
service::ReconcileResultRequest,
};
use pageserver_api::controller_api::{
@@ -335,19 +338,19 @@ pub(crate) enum ReconcileWaitError {
Failed(TenantShardId, Arc<ReconcileError>),
}
#[derive(Eq, PartialEq, Debug)]
#[derive(Eq, PartialEq, Debug, Clone)]
pub(crate) struct ReplaceSecondary {
old_node_id: NodeId,
new_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
#[derive(Eq, PartialEq, Debug, Clone)]
pub(crate) struct MigrateAttachment {
pub(crate) old_attached_node_id: NodeId,
pub(crate) new_attached_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
#[derive(Eq, PartialEq, Debug, Clone)]
pub(crate) enum ScheduleOptimizationAction {
// Replace one of our secondary locations with a different node
ReplaceSecondary(ReplaceSecondary),
@@ -355,7 +358,7 @@ pub(crate) enum ScheduleOptimizationAction {
MigrateAttachment(MigrateAttachment),
}
#[derive(Eq, PartialEq, Debug)]
#[derive(Eq, PartialEq, Debug, Clone)]
pub(crate) struct ScheduleOptimization {
// What was the reconcile sequence when we generated this optimization? The optimization
// should only be applied if the shard's sequence is still at this value, in case other changes
@@ -537,13 +540,19 @@ impl TenantShard {
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
let node_id =
scheduler.schedule_shard::<AttachedShardTag>(&self.intent.secondary, context)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
#[instrument(skip_all, fields(
tenant_id=%self.tenant_shard_id.tenant_id,
shard_id=%self.tenant_shard_id.shard_slug(),
sequence=%self.sequence
))]
pub(crate) fn schedule(
&mut self,
scheduler: &mut Scheduler,
@@ -613,7 +622,8 @@ impl TenantShard {
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
let node_id = scheduler
.schedule_shard::<SecondaryShardTag>(&used_pageservers, context)?;
self.intent.push_secondary(scheduler, node_id);
used_pageservers.push(node_id);
modified = true;
@@ -626,7 +636,7 @@ impl TenantShard {
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[], context)?;
let node_id = scheduler.schedule_shard::<SecondaryShardTag>(&[], context)?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
@@ -803,9 +813,10 @@ impl TenantShard {
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
// This implicitly limits the choice to nodes that are available, and prefers nodes
// with lower utilization.
let Ok(candidate_node) =
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
else {
let Ok(candidate_node) = scheduler.schedule_shard::<SecondaryShardTag>(
&self.intent.all_pageservers(),
schedule_context,
) else {
// A scheduling error means we have no possible candidate replacements
continue;
};
@@ -1333,6 +1344,8 @@ impl TenantShard {
#[cfg(test)]
pub(crate) mod tests {
use std::{cell::RefCell, rc::Rc};
use pageserver_api::{
controller_api::NodeAvailability,
shard::{ShardCount, ShardNumber},
@@ -1637,12 +1650,14 @@ pub(crate) mod tests {
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
// Returns the applied optimizations
fn optimize_til_idle(
nodes: &HashMap<NodeId, Node>,
scheduler: &mut Scheduler,
shards: &mut [TenantShard],
) {
) -> Vec<ScheduleOptimization> {
let mut loop_n = 0;
let mut optimizations = Vec::default();
loop {
let mut schedule_context = ScheduleContext::default();
let mut any_changed = false;
@@ -1657,6 +1672,7 @@ pub(crate) mod tests {
for shard in shards.iter_mut() {
let optimization = shard.optimize_attachment(nodes, &schedule_context);
if let Some(optimization) = optimization {
optimizations.push(optimization.clone());
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
@@ -1664,6 +1680,7 @@ pub(crate) mod tests {
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
if let Some(optimization) = optimization {
optimizations.push(optimization.clone());
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
@@ -1678,6 +1695,8 @@ pub(crate) mod tests {
loop_n += 1;
assert!(loop_n < 1000);
}
optimizations
}
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
@@ -1730,4 +1749,48 @@ pub(crate) mod tests {
Ok(())
}
/// Test that initial shard scheduling is optimal. By optimal we mean
/// that the optimizer cannot find a way to improve it.
///
/// This test is an example of the scheduling issue described in
/// https://github.com/neondatabase/neon/issues/8969
#[test]
fn initial_scheduling_is_optimal() -> anyhow::Result<()> {
use itertools::Itertools;
let nodes = make_test_nodes(2);
let mut scheduler = Scheduler::new([].iter());
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let a_context = Rc::new(RefCell::new(ScheduleContext::default()));
let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let b_context = Rc::new(RefCell::new(ScheduleContext::default()));
let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone()));
let b_shards_with_context = b.iter_mut().map(|shard| (shard, b_context.clone()));
let schedule_order = a_shards_with_context.interleave(b_shards_with_context);
for (shard, context) in schedule_order {
let context = &mut *context.borrow_mut();
shard.schedule(&mut scheduler, context).unwrap();
}
let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a);
assert_eq!(applied_to_a, vec![]);
let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b);
assert_eq!(applied_to_b, vec![]);
for shard in a.iter_mut().chain(b.iter_mut()) {
shard.intent.clear(&mut scheduler);
}
Ok(())
}
}

View File

@@ -1,13 +1,12 @@
use std::collections::{HashMap, HashSet};
use anyhow::Context;
use itertools::Itertools;
use pageserver::tenant::checks::check_valid_layermap;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver_api::shard::ShardIndex;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use tracing::{info, warn};
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -29,9 +28,8 @@ pub(crate) struct TimelineAnalysis {
/// yet.
pub(crate) warnings: Vec<String>,
/// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
/// of races between reading the metadata and reading the objects.
pub(crate) garbage_keys: Vec<String>,
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices, and not initdb archive.
pub(crate) unknown_keys: Vec<String>,
}
impl TimelineAnalysis {
@@ -39,7 +37,7 @@ impl TimelineAnalysis {
Self {
errors: Vec::new(),
warnings: Vec::new(),
garbage_keys: Vec::new(),
unknown_keys: Vec::new(),
}
}
@@ -59,7 +57,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
) -> TimelineAnalysis {
let mut result = TimelineAnalysis::new();
info!("Checking timeline {id}");
info!("Checking timeline");
if let Some(s3_active_branch) = s3_active_branch {
info!(
@@ -80,7 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
match s3_data {
Some(s3_data) => {
result
.garbage_keys
.unknown_keys
.extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
match s3_data.blob_data {
@@ -204,10 +202,10 @@ pub(crate) async fn branch_cleanup_and_check_errors(
warn!("Timeline metadata warnings: {0:?}", result.warnings);
}
if !result.garbage_keys.is_empty() {
error!(
"The following keys should be removed from S3: {0:?}",
result.garbage_keys
if !result.unknown_keys.is_empty() {
warn!(
"The following keys are not recognized: {0:?}",
result.unknown_keys
)
}
@@ -294,10 +292,10 @@ impl TenantObjectListing {
pub(crate) struct RemoteTimelineBlobData {
pub(crate) blob_data: BlobDataParseResult,
// Index objects that were not used when loading `blob_data`, e.g. those from old generations
/// Index objects that were not used when loading `blob_data`, e.g. those from old generations
pub(crate) unused_index_keys: Vec<ListingObject>,
// Objects whose keys were not recognized at all, i.e. not layer files, not indices
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices
pub(crate) unknown_keys: Vec<ListingObject>,
}
@@ -329,11 +327,54 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
}
}
/// Note (<https://github.com/neondatabase/neon/issues/8872>):
/// Since we do not guarantee the order of the listing, we could list layer keys right before
/// the pageserver's `RemoteTimelineClient` deletes the layer files and then the index.
/// In that rare case, this would give back a transient error where the index key is missing.
///
/// To avoid generating false positives, we stream the listing a second time.
pub(crate) async fn list_timeline_blobs(
remote_client: &GenericRemoteStorage,
id: TenantShardTimelineId,
root_target: &RootTarget,
) -> anyhow::Result<RemoteTimelineBlobData> {
let res = list_timeline_blobs_impl(remote_client, id, root_target).await?;
match res {
ListTimelineBlobsResult::Ready(data) => Ok(data),
ListTimelineBlobsResult::MissingIndexPart(_) => {
// Retry if index is missing.
let data = list_timeline_blobs_impl(remote_client, id, root_target)
.await?
.into_data();
Ok(data)
}
}
}
enum ListTimelineBlobsResult {
/// Blob data is ready to be interpreted.
Ready(RemoteTimelineBlobData),
/// The listing has layer files but is missing [`IndexPart`].
MissingIndexPart(RemoteTimelineBlobData),
}
impl ListTimelineBlobsResult {
/// Get the inner blob data regardless of the status.
pub fn into_data(self) -> RemoteTimelineBlobData {
match self {
ListTimelineBlobsResult::Ready(data) => data,
ListTimelineBlobsResult::MissingIndexPart(data) => data,
}
}
}
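Stripped of the scrubber specifics, the wrapper above is a retry-once pattern: classify the first listing, and if it looks transiently incomplete (layer files present but no index yet), list again and accept whatever the second pass returns. A generic sketch of that shape, assuming the anyhow crate that the scrubber already uses:

enum Listing<T> {
    Ready(T),
    MissingIndexPart(T),
}

async fn list_with_one_retry<T, F, Fut>(mut list: F) -> anyhow::Result<T>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<Listing<T>>>,
{
    match list().await? {
        Listing::Ready(data) => Ok(data),
        // Transient classification: retry once and take the result as-is.
        Listing::MissingIndexPart(_) => match list().await? {
            Listing::Ready(data) | Listing::MissingIndexPart(data) => Ok(data),
        },
    }
}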
/// Returns [`ListTimelineBlobsResult::MissingIndexPart`] if blob data has layer files
/// but is missing [`IndexPart`], otherwise returns [`ListTimelineBlobsResult::Ready`].
async fn list_timeline_blobs_impl(
remote_client: &GenericRemoteStorage,
id: TenantShardTimelineId,
root_target: &RootTarget,
) -> anyhow::Result<ListTimelineBlobsResult> {
let mut s3_layers = HashSet::new();
let mut errors = Vec::new();
@@ -375,30 +416,28 @@ pub(crate) async fn list_timeline_blobs(
s3_layers.insert((new_layer, gen));
}
Err(e) => {
tracing::info!("Error parsing key {maybe_layer_name}");
errors.push(
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
);
tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}");
unknown_keys.push(obj);
}
},
None => {
tracing::warn!("Unknown key {key}");
errors.push(format!("S3 list response got an object with odd key {key}"));
tracing::info!("S3 listed an unknown key: {key}");
unknown_keys.push(obj);
}
}
}
if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
tracing::debug!(
"Timeline is empty apart from initdb archive: expected post-deletion state."
);
return Ok(RemoteTimelineBlobData {
if index_part_keys.is_empty() && s3_layers.is_empty() {
tracing::debug!("Timeline is empty: expected post-deletion state.");
if initdb_archive {
tracing::info!("Timeline is post deletion but initdb archive is still present.");
}
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Relic,
unused_index_keys: index_part_keys,
unknown_keys: Vec::new(),
});
unknown_keys,
}));
}
// Choose the index_part with the highest generation
@@ -424,19 +463,43 @@ pub(crate) async fn list_timeline_blobs(
match index_part_object.as_ref() {
Some(selected) => index_part_keys.retain(|k| k != selected),
None => {
errors.push("S3 list response got no index_part.json file".to_string());
// It is possible that the branch was deleted after we had listed some layer files,
// so the index file no longer appears in the listing.
errors.push(
"S3 list response got no index_part.json file but still has layer files"
.to_string(),
);
return Ok(ListTimelineBlobsResult::MissingIndexPart(
RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
},
));
}
}
if let Some(index_part_object_key) = index_part_object.as_ref() {
let index_part_bytes =
download_object_with_retries(remote_client, &index_part_object_key.key)
.await
.context("index_part.json download")?;
match download_object_with_retries(remote_client, &index_part_object_key.key).await {
Ok(index_part_bytes) => index_part_bytes,
Err(e) => {
// It is possible that the branch gets deleted between listing the objects
// and downloading the index part file.
errors.push(format!("failed to download index_part.json: {e}"));
return Ok(ListTimelineBlobsResult::MissingIndexPart(
RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
},
));
}
};
match serde_json::from_slice(&index_part_bytes) {
Ok(index_part) => {
return Ok(RemoteTimelineBlobData {
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Parsed {
index_part: Box::new(index_part),
index_part_generation,
@@ -444,7 +507,7 @@ pub(crate) async fn list_timeline_blobs(
},
unused_index_keys: index_part_keys,
unknown_keys,
})
}))
}
Err(index_parse_error) => errors.push(format!(
"index_part.json body parsing error: {index_parse_error}"
@@ -458,9 +521,9 @@ pub(crate) async fn list_timeline_blobs(
);
}
Ok(RemoteTimelineBlobData {
Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
})
}))
}

View File

@@ -41,6 +41,10 @@ struct Cli {
#[arg(long)]
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
controller_jwt: Option<String>,
/// If set to true, the scrubber will exit with error code on fatal error.
#[arg(long, default_value_t = false)]
exit_code: bool,
}
#[derive(Subcommand, Debug)]
@@ -203,6 +207,7 @@ async fn main() -> anyhow::Result<()> {
tenant_ids,
json,
post_to_storcon,
cli.exit_code,
)
.await
}
@@ -269,6 +274,7 @@ async fn main() -> anyhow::Result<()> {
gc_min_age,
gc_mode,
post_to_storcon,
cli.exit_code,
)
.await
}
@@ -284,6 +290,7 @@ pub async fn run_cron_job(
gc_min_age: humantime::Duration,
gc_mode: GcMode,
post_to_storcon: bool,
exit_code: bool,
) -> anyhow::Result<()> {
tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
pageserver_physical_gc_cmd(
@@ -301,6 +308,7 @@ pub async fn run_cron_job(
Vec::new(),
true,
post_to_storcon,
exit_code,
)
.await?;
@@ -349,6 +357,7 @@ pub async fn scan_pageserver_metadata_cmd(
tenant_shard_ids: Vec<TenantShardId>,
json: bool,
post_to_storcon: bool,
exit_code: bool,
) -> anyhow::Result<()> {
if controller_client.is_none() && post_to_storcon {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
@@ -380,6 +389,9 @@ pub async fn scan_pageserver_metadata_cmd(
if summary.is_fatal() {
tracing::error!("Fatal scrub errors detected");
if exit_code {
std::process::exit(1);
}
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
@@ -391,6 +403,9 @@ pub async fn scan_pageserver_metadata_cmd(
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
if exit_code {
std::process::exit(1);
}
}
Ok(())

View File

@@ -12,6 +12,7 @@ use pageserver_api::controller_api::MetadataHealthUpdateRequest;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use serde::Serialize;
use tracing::{info_span, Instrument};
use utils::id::TenantId;
use utils::shard::ShardCount;
@@ -169,45 +170,54 @@ pub async fn scan_pageserver_metadata(
let mut timeline_ids = HashSet::new();
let mut timeline_generations = HashMap::new();
for (ttid, data) in timelines {
if ttid.tenant_shard_id.shard_count == highest_shard_count {
// Only analyze `TenantShardId`s with highest shard count.
async {
if ttid.tenant_shard_id.shard_count == highest_shard_count {
// Only analyze `TenantShardId`s with highest shard count.
// Stash the generation of each timeline, for later use identifying orphan layers
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _s3_layers,
} = &data.blob_data
{
if index_part.deleted_at.is_some() {
// skip deleted timeline.
tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid);
continue;
// Stash the generation of each timeline, for later use identifying orphan layers
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _s3_layers,
} = &data.blob_data
{
if index_part.deleted_at.is_some() {
// skip deleted timeline.
tracing::info!(
"Skip analysis of {} b/c timeline is already deleted",
ttid
);
return;
}
timeline_generations.insert(ttid, *index_part_generation);
}
timeline_generations.insert(ttid, *index_part_generation);
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
// reference counts for layers across the tenant.
let analysis = branch_cleanup_and_check_errors(
remote_client,
&ttid,
&mut tenant_objects,
None,
None,
Some(data),
)
.await;
summary.update_analysis(&ttid, &analysis);
timeline_ids.insert(ttid.timeline_id);
} else {
tracing::info!(
"Skip analysis of {} b/c a lower shard count than {}",
ttid,
highest_shard_count.0,
);
}
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
// reference counts for layers across the tenant.
let analysis = branch_cleanup_and_check_errors(
remote_client,
&ttid,
&mut tenant_objects,
None,
None,
Some(data),
)
.await;
summary.update_analysis(&ttid, &analysis);
timeline_ids.insert(ttid.timeline_id);
} else {
tracing::info!(
"Skip analysis of {} b/c a lower shard count than {}",
ttid,
highest_shard_count.0,
);
}
.instrument(
info_span!("analyze-timeline", shard = %ttid.tenant_shard_id.shard_slug(), timeline = %ttid.timeline_id),
)
.await
}
summary.timeline_count += timeline_ids.len();
@@ -278,6 +288,7 @@ pub async fn scan_pageserver_metadata(
timelines,
highest_shard_count,
)
.instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
.await;
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
highest_shard_count = ttid.tenant_shard_id.shard_count;
@@ -306,15 +317,18 @@ pub async fn scan_pageserver_metadata(
tenant_timeline_results.push((ttid, data));
}
let tenant_id = tenant_id.expect("Must be set if results are present");
if !tenant_timeline_results.is_empty() {
analyze_tenant(
&remote_client,
tenant_id.expect("Must be set if results are present"),
tenant_id,
&mut summary,
tenant_objects,
tenant_timeline_results,
highest_shard_count,
)
.instrument(info_span!("analyze-tenant", tenant = %tenant_id))
.await;
}
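
For reference, a minimal sketch of the `tracing` pattern introduced above: wrapping an async block with the `Instrument` extension trait so that every log line emitted inside carries the span's fields. The span and field names below are illustrative, and the sketch assumes the tokio, tracing, and tracing-subscriber crates:

use tracing::{info_span, Instrument};

async fn analyze_timelines(timeline_ids: Vec<u64>) {
    for id in timeline_ids {
        async {
            // Any event logged here inherits the "analyze-timeline" span fields.
            tracing::info!("analyzing");
        }
        .instrument(info_span!("analyze-timeline", timeline = %id))
        .await;
    }
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    analyze_timelines(vec![1, 2, 3]).await;
}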

View File

@@ -0,0 +1,100 @@
"""
Run the regression tests on the cloud instance of Neon
"""
from pathlib import Path
from typing import Any
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.pg_version import PgVersion
@pytest.fixture
def setup(remote_pg: RemotePostgres):
"""
Setup and teardown of the tests
"""
with psycopg2.connect(remote_pg.connstr()) as conn:
with conn.cursor() as cur:
log.info("Creating the extension")
cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
conn.commit()
# TODO: Migrate to branches and remove this code
log.info("Looking for subscriptions in the regress database")
cur.execute(
"SELECT subname FROM pg_catalog.pg_subscription WHERE "
"subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
)
if cur.rowcount > 0:
with psycopg2.connect(
dbname="regression",
host=remote_pg.default_options["host"],
user=remote_pg.default_options["user"],
password=remote_pg.default_options["password"],
) as regress_conn:
with regress_conn.cursor() as regress_cur:
for sub in cur:
regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
regress_cur.execute(
f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
)
regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
regress_conn.commit()
yield
# TODO: Migrate to branches and remove this code
log.info("Looking for extra roles...")
with psycopg2.connect(remote_pg.connstr()) as conn:
with conn.cursor() as cur:
cur.execute(
"SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
)
roles: list[Any] = []
for role in cur:
log.info("Role found: %s", role[0])
roles.append(role[0])
for role in roles:
cur.execute(f"DROP ROLE {role}")
conn.commit()
@pytest.mark.timeout(7200)
@pytest.mark.remote_cluster
def test_cloud_regress(
setup,
remote_pg: RemotePostgres,
pg_version: PgVersion,
pg_distrib_dir: Path,
base_dir: Path,
test_output_dir: Path,
):
"""
Run the regression tests
"""
regress_bin = (
pg_distrib_dir / f"{pg_version.v_prefixed}/lib/postgresql/pgxs/src/test/regress/pg_regress"
)
test_path = base_dir / f"vendor/postgres-{pg_version.v_prefixed}/src/test/regress"
env_vars = {
"PGHOST": remote_pg.default_options["host"],
"PGPORT": str(
remote_pg.default_options["port"] if "port" in remote_pg.default_options else 5432
),
"PGUSER": remote_pg.default_options["user"],
"PGPASSWORD": remote_pg.default_options["password"],
"PGDATABASE": remote_pg.default_options["dbname"],
}
regress_cmd = [
str(regress_bin),
f"--inputdir={test_path}",
f"--bindir={pg_distrib_dir}/{pg_version.v_prefixed}/bin",
"--dlpath=/usr/local/lib",
"--max-concurrent-tests=20",
f"--schedule={test_path}/parallel_schedule",
"--max-connections=5",
]
remote_pg.pg_bin.run(regress_cmd, env=env_vars, cwd=test_output_dir)

View File

@@ -849,7 +849,7 @@ class NeonEnvBuilder:
for directory_to_clean in reversed(directories_to_clean):
if not os.listdir(directory_to_clean):
log.info(f"Removing empty directory {directory_to_clean}")
log.debug(f"Removing empty directory {directory_to_clean}")
try:
directory_to_clean.rmdir()
except Exception as e:
@@ -2553,7 +2553,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
desired_availability: Optional[PageserverAvailability],
desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
max_attempts: int,
backoff: int,
backoff: float,
):
"""
Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability'
@@ -2948,7 +2948,7 @@ class NeonPageserver(PgProtocol, LogUtils):
self.id
):
self.env.storage_controller.poll_node_status(
self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1
self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1
)
return self
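
The switch from an integer to a float backoff allows sub-second sleeps between polls; a minimal sketch of that polling pattern (names are illustrative, not the fixture's API):

use std::thread::sleep;
use std::time::Duration;

// Retry `check` up to `max_attempts` times, sleeping `backoff_secs` between attempts.
fn poll_until(max_attempts: u32, backoff_secs: f64, mut check: impl FnMut() -> bool) -> bool {
    for _ in 0..max_attempts {
        if check() {
            return true;
        }
        sleep(Duration::from_secs_f64(backoff_secs));
    }
    false
}

fn main() {
    // 200 attempts at 0.1 s keeps the same ~20 s upper bound as 20 attempts at 1 s,
    // while reacting up to 10x faster once the node reports ACTIVE.
    assert!(poll_until(200, 0.1, || true));
}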
@@ -4617,7 +4617,8 @@ class StorageScrubber:
"REGION": s3_storage.bucket_region,
"BUCKET": s3_storage.bucket_name,
"BUCKET_PREFIX": s3_storage.prefix_in_bucket,
"RUST_LOG": "DEBUG",
"RUST_LOG": "INFO",
"PAGESERVER_DISABLE_FILE_LOGGING": "1",
}
env.update(s3_storage.access_env_vars())
@@ -4637,10 +4638,8 @@ class StorageScrubber:
(output_path, stdout, status_code) = subprocess_capture(
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
env=env,
check=False,
check=True,
capture_stdout=True,
timeout=timeout,
)

View File

@@ -631,7 +631,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
log.info(
f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
)
res = self.post(
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
json=config,
)

View File

@@ -236,7 +236,7 @@ def get_scale_for_db(size_mb: int) -> int:
ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
)
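
To illustrate the broadened pattern, a small runnable check written with the Rust `regex` crate purely for brevity (the pattern syntax is the same as Python's `re`; the sample filenames are made up):

use regex::Regex;

fn main() {
    let re = Regex::new(
        r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)",
    )
    .unwrap();
    // `regression.out` is now collected as an attachment, alongside the old matches.
    assert!(re.is_match("regression.out"));
    assert!(re.is_match("regression.diffs"));
    assert!(re.is_match("pageserver_1.log"));
    assert!(!re.is_match("notes.txt"));
}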

View File

@@ -0,0 +1,21 @@
from fixtures.neon_fixtures import NeonEnv
def test_compute_metrics(neon_simple_env: NeonEnv):
"""
Test compute metrics, exposed in the neon_backend_perf_counters and
neon_perf_counters views
"""
env = neon_simple_env
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
# We don't check that the values make sense; this is just a very
# basic check that the server doesn't crash or something like that.
#
# 1.5 is the minimum version to contain these views.
cur.execute("CREATE EXTENSION neon VERSION '1.5'")
cur.execute("SELECT * FROM neon_perf_counters")
cur.execute("SELECT * FROM neon_backend_perf_counters")

View File

@@ -198,9 +198,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
def run_pgbench(connstr: str, pg_bin: PgBin):
log.info(f"Start a pgbench workload on pg {connstr}")
# s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
log.info("pgbench init done")
pg_bin.run_capture(["pgbench", "-T60", connstr])
@@ -247,9 +244,15 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
log.info(
f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
)
# s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()])
log.info("pgbench init done in primary")
t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
t.start()
# Wait until pgbench_accounts is created + filled on replica *and*
# Wait until we see that pgbench_accounts is created + filled on replica *and*
# index is created. Otherwise index creation would conflict with
# read queries and hs feedback won't save us.
wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))

View File

@@ -10,11 +10,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
#
# Test branching, when a transaction is in prepared state
#
@pytest.mark.timeout(600)
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
"""
Test resizing the Local File Cache
"""
env = neon_simple_env
endpoint = env.endpoints.create_start(
"main",
@@ -32,27 +32,48 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
# Initializing the pgbench database can be very slow, especially on debug builds.
connstr = endpoint.connstr(options="-cstatement_timeout=300s")
thread = threading.Thread(target=run_pgbench, args=(connstr,), daemon=True)
thread.start()
conn = endpoint.connect()
cur = conn.cursor()
for _ in range(n_resize):
# For as long as pgbench is running, twiddle the LFC size once a second.
# Note that we launch this immediately, already while the "pgbench -i"
# initialization step is still running. That's quite a different workload
# than the actual pgbench benchmark run, so this gives us coverage of both.
while thread.is_alive():
size = random.randint(1, 512)
cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
cur.execute("alter system set neon.file_cache_size_limit='100MB'")
cur.execute("select pg_reload_conf()")
thread.join()
lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
assert lfc_file_size <= 512 * 1024 * 1024
# At the end, set it at 100 MB, and perform a final check that the disk usage
# of the file is in that ballpark.
#
# We retry the check a few times, because it might take a while for the
# system to react to changing the setting and shrinking the file.
cur.execute("alter system set neon.file_cache_size_limit='100MB'")
cur.execute("select pg_reload_conf()")
nretries = 10
while True:
lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
res = subprocess.run(
["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
assert lfc_file_size <= 512 * 1024 * 1024
if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0:
break
nretries = nretries - 1
time.sleep(1)
assert int(lfc_file_blocks) <= 128 * 1024

View File

@@ -50,8 +50,8 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.4",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"]
current_version = "1.4"
all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"]
current_version = "1.5"
for idx, begin_version in enumerate(all_versions):
for target_version in all_versions[idx + 1 :]:
if current_version != begin_version:

View File

@@ -15,8 +15,13 @@ def test_unlogged(neon_simple_env: NeonEnv):
cur = conn.cursor()
cur.execute("CREATE UNLOGGED TABLE iut (id int);")
# create index to test unlogged index relation as well
# create index to test unlogged index relations as well
cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
cur.execute("CREATE INDEX ON iut USING gist (int4range(id, id, '[]'));")
cur.execute("CREATE INDEX ON iut USING spgist (int4range(id, id, '[]'));")
cur.execute("CREATE INDEX ON iut USING gin ((id::text::jsonb));")
cur.execute("CREATE INDEX ON iut USING brin (id);")
cur.execute("CREATE INDEX ON iut USING hash (id);")
cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
cur.execute("INSERT INTO iut (id) values (42);")
@@ -39,3 +44,12 @@ def test_unlogged(neon_simple_env: NeonEnv):
assert results == [(43, 2)]
else:
assert results == [(43, 1)]
# Flush all data and compact it, so we detect any errors related to
# unlogged indexes materialization.
ps_http = env.pageserver.http_client()
ps_http.timeline_compact(
tenant_id=env.initial_tenant,
timeline_id=env.initial_timeline,
force_image_layer_creation=True,
)

vendor/revisions.json vendored
View File

@@ -1,18 +1,18 @@
{
"v17": [
"17rc1",
"2cf120e7393ca5f537c6a38b457585576dc035fc"
"17.0",
"68b5038f27e493bde6ae552fe066f10cbdfe6a14"
],
"v16": [
"16.4",
"1d7081a3b076ddf5086e0b118d4329820e6a7427"
"e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
],
"v15": [
"15.8",
"16c3c6b64f1420a367a2a9b2510f20d94f791af8"
"22e580fe9ffcea7e02592110b1c9bf426d83cada"
],
"v14": [
"14.13",
"a38d15f3233a4c07f2bf3335fcbd874dd1f4e386"
"2199b83fb72680001ce0f43bf6187a21dfb8f45d"
]
}

View File

@@ -1,550 +0,0 @@
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
---
commands:
- name: cgconfigparser
user: root
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
# running it as root.
- name: chmod-resize-swap
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/resize-swap'
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
- name: sql-exporter
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
- name: sql-exporter-autoscaling
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
- filename: compute_ctl-resize-swap
content: |
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
- filename: pgbouncer.ini
content: |
[databases]
*=host=localhost port=5432 auth_user=cloud_admin
[pgbouncer]
listen_port=6432
listen_addr=0.0.0.0
auth_type=scram-sha-256
auth_user=cloud_admin
auth_dbname=postgres
client_tls_sslmode=disable
server_tls_sslmode=disable
pool_mode=transaction
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
group neon-postgres {
perm {
admin {
uid = postgres;
}
task {
gid = users;
}
}
memory {}
}
- filename: sql_exporter.yml
content: |
# Configuration for sql_exporter
# Global defaults.
global:
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
scrape_timeout: 10s
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
scrape_timeout_offset: 500ms
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
min_interval: 0s
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
# as will concurrent scrapes.
max_connections: 1
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
# always be the same as max_connections.
max_idle_connections: 1
# Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
# If 0, connections are not closed due to a connection's age.
max_connection_lifetime: 5m
# The target to monitor and the collectors to execute on it.
target:
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
# the schema gets dropped or replaced to match the driver expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collectors: [neon_collector]
# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
- "neon_collector.yml"
- filename: sql_exporter_autoscaling.yml
content: |
# Configuration for sql_exporter for autoscaling-agent
# Global defaults.
global:
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
scrape_timeout: 10s
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
scrape_timeout_offset: 500ms
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
min_interval: 0s
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
# as will concurrent scrapes.
max_connections: 1
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
# always be the same as max_connections.
max_idle_connections: 1
# Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
# If 0, connections are not closed due to a connection's age.
max_connection_lifetime: 5m
# The target to monitor and the collectors to execute on it.
target:
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
# the schema gets dropped or replaced to match the driver expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collectors: [neon_collector_autoscaling]
# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
- "neon_collector_autoscaling.yml"
- filename: neon_collector.yml
content: |
collector_name: neon_collector
metrics:
- metric_name: lfc_misses
type: gauge
help: 'lfc_misses'
key_labels:
values: [lfc_misses]
query: |
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
- metric_name: lfc_used
type: gauge
help: 'LFC chunks used (chunk = 1MB)'
key_labels:
values: [lfc_used]
query: |
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
- metric_name: lfc_hits
type: gauge
help: 'lfc_hits'
key_labels:
values: [lfc_hits]
query: |
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
- metric_name: lfc_writes
type: gauge
help: 'lfc_writes'
key_labels:
values: [lfc_writes]
query: |
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
- metric_name: lfc_cache_size_limit
type: gauge
help: 'LFC cache size limit in bytes'
key_labels:
values: [lfc_cache_size_limit]
query: |
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
- metric_name: connection_counts
type: gauge
help: 'Connection counts'
key_labels:
- datname
- state
values: [count]
query: |
select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
- metric_name: pg_stats_userdb
type: gauge
help: 'Stats for several oldest non-system dbs'
key_labels:
- datname
value_label: kind
values:
- db_size
- deadlocks
# Rows
- inserted
- updated
- deleted
# We export stats for 10 non-system databases. Without this limit
# it is too easy to abuse the system by creating lots of databases.
query: |
select pg_database_size(datname) as db_size, deadlocks,
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
datname
from pg_stat_database
where datname IN (
select datname
from pg_database
where datname <> 'postgres' and not datistemplate
order by oid
limit 10
);
- metric_name: max_cluster_size
type: gauge
help: 'neon.max_cluster_size setting'
key_labels:
values: [max_cluster_size]
query: |
select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
- metric_name: db_total_size
type: gauge
help: 'Size of all databases'
key_labels:
values: [total]
query: |
select sum(pg_database_size(datname)) as total from pg_database;
# DEPRECATED
- metric_name: lfc_approximate_working_set_size
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels:
values: [approximate_working_set_size]
query: |
select neon.approximate_working_set_size(false) as approximate_working_set_size;
- metric_name: lfc_approximate_working_set_size_windows
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels: [duration]
values: [size]
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
# of durations in a pretty-printed form.
query: |
select
x as duration,
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
from
(values ('5m'),('15m'),('1h')) as t (x);
- metric_name: compute_current_lsn
type: gauge
help: 'Current LSN of the database'
key_labels:
values: [lsn]
query: |
select
case
when pg_catalog.pg_is_in_recovery()
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
else (pg_current_wal_lsn() - '0/0')::FLOAT8
end as lsn;
- metric_name: compute_receive_lsn
type: gauge
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
key_labels:
values: [lsn]
query: |
SELECT
CASE
WHEN pg_catalog.pg_is_in_recovery()
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
ELSE 0
END AS lsn;
- metric_name: replication_delay_bytes
type: gauge
help: 'Bytes between received and replayed LSN'
key_labels:
values: [replication_delay_bytes]
# We use a GREATEST call here because this calculation can be negative.
# The calculation is not atomic, meaning after we've gotten the receive
# LSN, the replay LSN may have advanced past the receive LSN we
# are using for the calculation.
query: |
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
- metric_name: replication_delay_seconds
type: gauge
help: 'Time since last LSN was replayed'
key_labels:
values: [replication_delay_seconds]
query: |
SELECT
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;
- metric_name: checkpoints_req
type: gauge
help: 'Number of requested checkpoints'
key_labels:
values: [checkpoints_req]
query: |
SELECT checkpoints_req FROM pg_stat_bgwriter;
- metric_name: checkpoints_timed
type: gauge
help: 'Number of scheduled checkpoints'
key_labels:
values: [checkpoints_timed]
query: |
SELECT checkpoints_timed FROM pg_stat_bgwriter;
- metric_name: compute_logical_snapshot_files
type: gauge
help: 'Number of snapshot files in pg_logical/snapshot'
key_labels:
- timeline_id
values: [num_logical_snapshot_files]
query: |
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-- temporary snapshot files are renamed to the actual snapshot files after they are
-- completely built. We only WAL-log the completely built snapshot files.
(SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
- metric_name: logical_slot_restart_lsn
type: gauge
help: 'restart_lsn of logical slots'
key_labels:
- slot_name
values: [restart_lsn]
query: |
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
from pg_replication_slots
where slot_type = 'logical';
- metric_name: compute_subscriptions_count
type: gauge
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
key_labels:
- enabled
values: [subscriptions_count]
query: |
select subenabled::text as enabled, count(*) as subscriptions_count
from pg_subscription
group by subenabled;
- metric_name: retained_wal
type: gauge
help: 'Retained WAL in inactive replication slots'
key_labels:
- slot_name
values: [retained_wal]
query: |
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;
- metric_name: wal_is_lost
type: gauge
help: 'Whether or not the replication slot wal_status is lost'
key_labels:
- slot_name
values: [wal_is_lost]
query: |
SELECT slot_name,
CASE
WHEN wal_status = 'lost' THEN 1
ELSE 0
END AS wal_is_lost
FROM pg_replication_slots;
- filename: neon_collector_autoscaling.yml
content: |
collector_name: neon_collector_autoscaling
metrics:
- metric_name: lfc_misses
type: gauge
help: 'lfc_misses'
key_labels:
values: [lfc_misses]
query: |
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
- metric_name: lfc_used
type: gauge
help: 'LFC chunks used (chunk = 1MB)'
key_labels:
values: [lfc_used]
query: |
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
- metric_name: lfc_hits
type: gauge
help: 'lfc_hits'
key_labels:
values: [lfc_hits]
query: |
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
- metric_name: lfc_writes
type: gauge
help: 'lfc_writes'
key_labels:
values: [lfc_writes]
query: |
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
- metric_name: lfc_cache_size_limit
type: gauge
help: 'LFC cache size limit in bytes'
key_labels:
values: [lfc_cache_size_limit]
query: |
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
- metric_name: lfc_approximate_working_set_size_windows
type: gauge
help: 'Approximate working set size in pages of 8192 bytes'
key_labels: [duration_seconds]
values: [size]
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
# size looking back 1..60 minutes, labeled with the number of minutes.
query: |
select
x::text as duration_seconds,
neon.approximate_working_set_size_seconds(x) as size
from
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);
build: |
# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
# requires cgroup v2, so we'll build cgroup-tools ourselves.
FROM debian:bullseye-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
# Build pgbouncer
#
FROM debian:bullseye-slim AS pgbouncer
RUN set -e \
&& apt-get update \
&& apt-get install -y \
build-essential \
git \
libevent-dev \
libtool \
pkg-config
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
merge: |
# tweak nofile limits
RUN set -e \
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
&& test ! -e /etc/security || ( \
echo '* - nofile 1048576' >>/etc/security/limits.conf \
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
)
# Allow postgres user (compute_ctl) to run swap resizer.
# Need to install sudo in order to allow this.
#
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
RUN set -e \
&& apt update \
&& apt install --no-install-recommends -y \
sudo \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
COPY cgconfig.conf /etc/cgconfig.conf
COPY pgbouncer.ini /etc/pgbouncer.ini
COPY sql_exporter.yml /etc/sql_exporter.yml
COPY neon_collector.yml /etc/neon_collector.yml
COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
RUN set -e \
&& chown postgres:postgres /etc/pgbouncer.ini \
&& chmod 0666 /etc/pgbouncer.ini \
&& chmod 0644 /etc/cgconfig.conf \
&& chmod 0644 /etc/sql_exporter.yml \
&& chmod 0644 /etc/neon_collector.yml \
&& chmod 0644 /etc/sql_exporter_autoscaling.yml \
&& chmod 0644 /etc/neon_collector_autoscaling.yml
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer

View File

@@ -45,6 +45,7 @@ futures-io = { version = "0.3" }
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
half = { version = "2", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14", features = ["raw"] }
hex = { version = "0.4", features = ["serde"] }
hmac = { version = "0.12", default-features = false, features = ["reset"] }
@@ -106,6 +107,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
either = { version = "1" }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
half = { version = "2", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14", features = ["raw"] }
indexmap = { version = "1", default-features = false, features = ["std"] }
itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }