Compare commits


66 Commits

Author SHA1 Message Date
Konstantin Knizhnik
68488f5c74 Do vacuum freeze before copying data 2023-07-05 22:17:06 +03:00
Konstantin Knizhnik
1389927d36 Use cursor to copy data 2023-07-05 21:49:55 +03:00
Konstantin Knizhnik
06357afe6d Fix mapping of TOAST tables 2023-07-05 19:26:29 +03:00
Konstantin Knizhnik
6943dac164 Support cloning database without index rebuild 2023-07-05 15:47:13 +03:00
Christian Schwarz
505aa242ac page cache: add size metrics (#4629)
Make them a member of `struct PageCache` to prepare for a future
where there's no global state.
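A rough sketch of the shape this takes (type, field, and metric names here are assumptions, not the actual patch):

```rust
use prometheus::UIntGauge;

// Hypothetical sketch: size metrics owned by the cache rather than globals.
struct PageCacheSizeMetrics {
    max_bytes: UIntGauge,
    current_bytes: UIntGauge,
}

struct PageCache {
    size_metrics: PageCacheSizeMetrics,
    // ... slots, mappings, etc.
}

impl PageCache {
    fn new(max_bytes: u64) -> Self {
        let size_metrics = PageCacheSizeMetrics {
            max_bytes: UIntGauge::new("pageserver_page_cache_size_max_bytes", "cache capacity")
                .expect("metric creation"),
            current_bytes: UIntGauge::new("pageserver_page_cache_size_current_bytes", "cache usage")
                .expect("metric creation"),
        };
        size_metrics.max_bytes.set(max_bytes);
        Self { size_metrics }
    }
}
```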
2023-07-05 15:36:42 +03:00
arpad-m
1c516906e7 Impl Display for LayerFileName and require it for Layer (#4630)
Does three things:

* add a `Display` impl for `LayerFileName` equal to the `short_id`
* based on that, replace the `Layer::short_id` function by a requirement
for a `Display` impl
* use that `Display` impl in the places where the `short_id` and `file_name()` functions were used instead

Fixes #4145
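A minimal sketch of the pattern, using simplified stand-ins for the real types:

```rust
use std::fmt;

// Simplified stand-in for the real type.
struct LayerFileName(String);

impl LayerFileName {
    fn short_id(&self) -> &str {
        &self.0
    }
}

// The Display impl is defined to be equal to short_id ...
impl fmt::Display for LayerFileName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.short_id())
    }
}

// ... so the trait can require Display instead of a bespoke method.
trait Layer: fmt::Display {
    // ...
}
```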
2023-07-05 14:27:50 +02:00
Christian Schwarz
7d7cd8375c callers of task_mgr::spawn: some top-level async blocks were missing tenant/timeline id (#4283)
Looking at logs from staging and prod, I found there are a bunch of log
lines without tenant / timeline context.

Manually walk through all task_mgr::spawn lines and fix that using the
least amount of work required.

While doing it, remove some redundant `shutting down` messages. 

refs https://github.com/neondatabase/neon/issues/4222
2023-07-05 14:04:05 +02:00
Vadim Kharitonov
c92b7543b5 Update pgvector to 0.4.4 (#4632)
After announcing `hnsw`, there is a hypothesis that the community will
start comparing it with `pgvector` on their own. Therefore, let's have
an up-to-date version of `pgvector` in Neon.
2023-07-05 13:39:51 +03:00
Stas Kelvich
dbf88cf2d7 Minimalistic pool for http endpoint compute connections (under opt-in flag)
Cache up to 20 connections per endpoint. Once all pooled connections
are in use, the current implementation can open extra connections, so the
maximum number of simultaneous connections is not enforced.

There are more things to do here, especially background clean-up
of closed connections and checks for transaction state. But the current
implementation lets us measure the lower connection latencies that
this cache should bring.
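A minimal sketch of such a per-endpoint pool, with hypothetical types and the 20-connection limit mentioned above:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

const MAX_CONNS_PER_ENDPOINT: usize = 20;

// Hypothetical connection type; the real pool stores postgres connections.
struct Conn;

#[derive(Default)]
struct EndpointPool {
    pools: Mutex<HashMap<String, Vec<Conn>>>,
}

impl EndpointPool {
    /// Take a pooled connection if one exists; callers open a fresh one
    /// otherwise, which is why the maximum is not enforced.
    fn get(&self, endpoint: &str) -> Option<Conn> {
        self.pools.lock().unwrap().get_mut(endpoint)?.pop()
    }

    /// Return a connection, dropping it if the per-endpoint pool is full.
    fn put(&self, endpoint: String, conn: Conn) {
        let mut pools = self.pools.lock().unwrap();
        let pool = pools.entry(endpoint).or_default();
        if pool.len() < MAX_CONNS_PER_ENDPOINT {
            pool.push(conn);
        }
    }
}
```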
2023-07-05 12:00:03 +03:00
Konstantin Knizhnik
f1db87ac36 Check if there is enough memory for HNSW index (#4602)
## Problem

HNSW index is created in memory.
Try to prevent OOM by checking the available RAM.


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-07-05 11:40:38 +03:00
Christian Schwarz
3f9defbfb4 page cache: add access & hit rate metrics (#4628)
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
2023-07-05 10:38:32 +02:00
bojanserafimov
c7143dbde6 compute_ctl: Fix misleading metric (#4608) 2023-07-04 19:07:36 -04:00
Stas Kelvich
cbf9a40889 Set a shorter timeout for the initial connection attempt in proxy.
In case we try to connect to an outdated address that is no longer valid, the
default behavior of Kubernetes is to drop the packets, causing us to wait for
the entire timeout period. We want to fail fast in such cases.

A specific case to consider is when we have cached compute node information
with a 5-minute TTL (Time To Live), but the user has executed a `/suspend` API
call, resulting in the nonexistence of the compute node.
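A hedged sketch of the idea; the timeout value and function name are illustrative:

```rust
use std::time::Duration;

use tokio::net::TcpStream;
use tokio::time::timeout;

// Illustrative value; the point is "much shorter than the OS default".
const INITIAL_CONNECT_TIMEOUT: Duration = Duration::from_secs(2);

async fn connect_to_compute(addr: &str) -> anyhow::Result<TcpStream> {
    match timeout(INITIAL_CONNECT_TIMEOUT, TcpStream::connect(addr)).await {
        Ok(Ok(stream)) => Ok(stream),
        Ok(Err(e)) => Err(e.into()),
        // Packets silently dropped (e.g. stale cached address): fail fast so
        // the caller can invalidate the cache and re-resolve.
        Err(_elapsed) => anyhow::bail!("connecting to {addr} timed out"),
    }
}
```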
2023-07-04 20:34:22 +03:00
Joonas Koivunen
10aba174c9 metrics: Remove comments regarding upgradeable rwlocks (#4622)
Closes #4001 by removing the comments alluding to
upgradeable/downgradeable RwLocks.
2023-07-04 17:40:51 +03:00
Conrad Ludgate
ab2ea8cfa5 use pbkdf2 crate (#4626)
## Problem

While pbkdf2 is a simple algorithm, we should probably use a well-tested
implementation.

## Summary of changes

* Use pbkdf2 crate
* Use arrays like the hmac comment says
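A minimal sketch of what delegating to the `pbkdf2` crate looks like (the function and parameter names are illustrative; SCRAM-SHA-256 derives a 32-byte key):

```rust
use pbkdf2::pbkdf2_hmac;
use sha2::Sha256;

// Hedged sketch: derive SCRAM's SaltedPassword with the well-tested crate,
// into a fixed-size array as the hmac comment suggests.
fn salted_password(password: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
    let mut out = [0u8; 32];
    pbkdf2_hmac::<Sha256>(password, salt, iterations, &mut out);
    out
}
```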

2023-07-04 14:54:59 +01:00
arpad-m
9c8c55e819 Add _cached and _bytes to pageserver_tenant_synthetic_size metric name (#4616)
This renames the `pageserver_tenant_synthetic_size` metric to
`pageserver_tenant_synthetic_cached_size_bytes`, as was requested on
slack (link in the linked issue).

* `_cached` to hint that it is not incrementally calculated
* `_bytes` to indicate the unit the size is measured in

Fixes #3748
2023-07-03 19:34:07 +02:00
Conrad Ludgate
10110bee69 fix setup instructions (#4615)
## Problem

1. The local endpoints provision 2 ports (postgres and HTTP) which means
the migration_check endpoint has a different port than what the setup
implies
2. psycopg2-binary 2.9.3 has a deprecated poetry config and doesn't
install.

## Summary of changes

Update psycopg2-binary and update the endpoint ports in the readme

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-07-03 18:10:44 +03:00
Joonas Koivunen
cff7ae0b0d fix: no more ansi colored logs (#4613)
Allure does not support ansi colored logs, yet `compute_ctl` has them.

Upgrade criterion to get rid of the atty dependency, disable ansi colors,
and disable the ansi feature of tracing-subscriber.

This is a heavy-handed approach. I am not aware of a workflow where
you'd want to connect a terminal directly to, for example, `compute_ctl`;
usually you find the logs in a file. If someone had been using colors,
they will now need to:
- set `tracing-subscriber.default-features` to `true`
- edit the relevant project to enable colors

I decided to explicitly disable ansi colors in case a dependency would
in the future accidentally enable the feature on `tracing-subscriber`,
which would be quite surprising but not unimaginable.

By getting rid of `atty` from dependencies we get rid of
<https://github.com/advisories/GHSA-g98v-hv3f-hcfr>.
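The corresponding subscriber setup is a one-liner; a minimal sketch:

```rust
// Explicitly disable ANSI escapes so that a dependency re-enabling
// tracing-subscriber's `ansi` feature cannot turn colors back on.
fn init_logging() {
    tracing_subscriber::fmt()
        .with_ansi(false)
        .init();
}
```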
2023-07-03 16:37:02 +03:00
Alexander Bayandin
78a7f68902 Make pg_version and build_type regular parameters (#4311)
## Problem

All tests have already been parametrised by Postgres version and build
type (to make them distinguishable in the Allure report), but despite
that, the DEFAULT_PG_VERSION and BUILD_TYPE env vars still have to be
set to the corresponding values. For example, to run the
`test_timeline_deletion_with_files_stuck_in_upload_queue[release-pg14-local_fs]`
test, it's required to set `DEFAULT_PG_VERSION=14` and
`BUILD_TYPE=release`.
This PR makes the test framework pick up parameters from the test name
itself.

## Summary of changes
- Postgres version and build type related fixtures are now
function-scoped (instead of session-scoped, as before)
- Deprecate `--pg-version` argument in favour of DEFAULT_PG_VERSION env
variable (it's easier to parse)
- GitHub autocomment now includes only one command with all the failed
tests + runs them in parallel
2023-07-03 13:51:40 +01:00
Christian Schwarz
24eaa3b7ca timeline creation: reflect failures due to ancestor LSN issues in status code (#4600)
Before, it was a `500` and the control plane would retry, whereas it
actually should have stopped retrying.

(Stacked on top of https://github.com/neondatabase/neon/pull/4597 )

fixes https://github.com/neondatabase/neon/issues/4595
part of https://github.com/neondatabase/cloud/issues/5626

---------

Co-authored-by: Shany Pozin <shany@neon.tech>
2023-07-03 15:21:10 +03:00
Shany Pozin
26828560a8 Add timeouts and retries to consumption metrics reporting client (#4563)
## Problem
#4528
## Summary of changes
Add a 60-second default timeout to the reqwest client.
Add retries, up to 3 times, for calls into the metric consumption
endpoint.
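A hedged sketch of such a client using `reqwest-middleware` and the newly added `reqwest-retry` dependency (visible in the Cargo.lock diff below); values taken from the description above:

```rust
use std::time::Duration;

use reqwest_middleware::ClientBuilder;
use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};

fn build_client() -> reqwest_middleware::ClientWithMiddleware {
    let base = reqwest::Client::builder()
        .timeout(Duration::from_secs(60)) // 60 s default request timeout
        .build()
        .expect("client construction should not fail");
    // Retry transient failures up to 3 times with exponential backoff.
    let retry_policy = ExponentialBackoff::builder().build_with_max_retries(3);
    ClientBuilder::new(base)
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
}
```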

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-07-03 15:20:49 +03:00
Alek Westover
86604b3b7d Delete Unnecessary files in Extension Bucket (#4606)
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-07-03 07:37:30 -04:00
Joonas Koivunen
4957bb2d48 fix(proxy): stray span enter globbers up logs (#4612)
Prod logs have deep accidental span nesting. It was introduced in #3759 and
has been untouched since; maybe no one watches proxy logs? :) I found it
by accident when looking to see if proxy logs have ansi colors with
`{neon_service="proxy"}`.

The solution is to mostly stop using `Span::enter` or `Span::entered` in
async code. `Span::entered` is kept in the cancel-on-shutdown related path.
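A minimal sketch of the before/after pattern:

```rust
use tracing::{info_span, Instrument};

async fn do_io() { /* ... */ }

// Holding a Span::enter() guard across an .await leaks the span into
// whatever task the executor polls next; attach it to the future instead.
async fn handle_client() {
    // Buggy: let _guard = info_span!("client").enter(); do_io().await;
    do_io().instrument(info_span!("client")).await; // correct
}
```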
2023-07-03 11:53:57 +01:00
Sasha Krassovsky
ff1a1aea86 Make control plane connector send encrypted password (#4607)
Control plane needs the encrypted hash that postgres itself generates
2023-06-30 14:17:44 -07:00
Em Sharnoff
c9f05d418d Bump vm-builder v0.11.0 -> v0.11.1 (#4605)
This applies the fix from https://github.com/neondatabase/autoscaling/pull/367,
which should resolve the "leaking cloud_admin connections" issue that
has been observed for some customers.
2023-06-30 23:49:06 +03:00
bojanserafimov
9de1a6fb14 cold starts: Run sync_safekeepers on compute_ctl shutdown (#4588) 2023-06-30 16:29:47 -04:00
Konstantin Knizhnik
fbd37740c5 Make file_cache logging less verbose (#4601)
## Problem


Message "set local file cache limit to..." polutes compute logs.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-06-30 21:55:56 +03:00
Anastasia Lubennikova
3e55d9dec6 Bump vendor/postgres
Point to REL_14_STABLE_neon and REL_15_STABLE_neon.
This change updates PostgreSQL versions to 14.8 and 15.3
2023-06-30 16:29:10 +03:00
Christian Schwarz
f558f88a08 refactor: distinguished error type for timeline creation failure (#4597)
refs https://github.com/neondatabase/neon/issues/4595
2023-06-30 14:53:21 +02:00
Dmitry Rodionov
b990200496 tests: use shortcut everywhere to get timeline path (#4586) 2023-06-30 15:01:06 +03:00
Alex Chi Z
7e20b49da4 refactor: use LayerDesc in LayerMap (part 2) (#4437)
## Problem

part of https://github.com/neondatabase/neon/issues/4392, continuation
of https://github.com/neondatabase/neon/pull/4408

## Summary of changes

This PR removes all layer objects from LayerMap and moves them to the
timeline struct. In the timeline struct, LayerFileManager maps a layer
descriptor to a layer object, and it is stored in the same RwLock as
LayerMap to avoid behavior differences.

Key changes:

* LayerMap no longer has a generic parameter and only stores descriptors.
* In Timeline, we add a new struct called the layer mapping.
* Currently, the layer mapping is stored under the same lock as the layer
map. Every time we retrieve data from the layer map, we need to map the
descriptor to the actual object.
* `replace_historic` is moved to the layer mapping's `replace`, and the
return value behavior is different from before. I'm a little unsure about
this part and it would be good to have some comments on that.
* Some test cases are rewritten to adapt to the new interface, and we
can decide whether to remove them in the future because they do not make
much sense now.
* LayerDescriptor is moved to the `tests` module and should only be used
for unit testing / benchmarks.
* Because we now have a usage pattern like "take the lock guard, then
get references to two fields", we want to avoid dropping the wrong
object when we intend to release the lock guard. Therefore, a new set of
helper functions, `drop_r/wlock`, is added. These can be removed in the
future when we finish the refactor.

TODOs after this PR: fully remove RemoteLayer, and move LayerMapping to
a separate LayerCache.

all refactor PRs:

```
#4437 --- #4479 ------------ #4510 (refactor done at this point)
      \-- #4455 -- #4502 --/
```

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-06-29 15:06:07 -04:00
Alek Westover
032b603011 Fix: Wrong Enum Variant (#4589) 2023-06-29 10:55:02 -04:00
Alex Chi Z
ca0e0781c8 use const instead of magic number for repartition threshold (#4286)
There is a magic number controlling how often we repartition and,
therefore, how often we compact. This PR makes this number `10` a global
constant and adds docs.
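A hedged sketch of the change; the constant name, function, and exact semantics here are assumptions, not the actual patch:

```rust
// Replace the bare `10` with a documented global constant so the
// compaction cadence is discoverable.

/// How many checkpoint distances may pass between repartitionings (and
/// thus between compaction checks). Assumed semantics for illustration.
const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

fn should_repartition(distance_since_last: u64, checkpoint_distance: u64) -> bool {
    distance_since_last >= REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE * checkpoint_distance
}
```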

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-06-29 16:56:17 +03:00
Alexander Bayandin
b2a5e91a88 Upload custom extensions to S3 (#4585)
## Problem

We want to have a number of custom extensions that will not be available
by default; an example of such is [Postgres
Anonymizer](https://postgresql-anonymizer.readthedocs.io/en/stable/)
(please note that this extension should be added to
`shared_preload_libraries`). To distinguish them, custom extensions
should be added to a different S3 path:

```
s3://<bucket>/<release version>/<postgres_version>/<ext_name>/share/extensions/
s3://<bucket>/<release version>/<postgres_version>/<ext_name>/lib

where <ext_name> is an extension name
```

Resolves https://github.com/neondatabase/neon/issues/4582

## Summary of changes
- Add the Postgres Anonymizer extension to the Dockerfile (it's included only
in the postgres-extensions target)
- Build extensions image from postgres-extensions target in a workflow
- Upload custom extensions to S3 (different directory)
2023-06-29 16:33:26 +03:00
Joonas Koivunen
44e7d5132f fix: hide token from logs (#4584)
fixes #4583 and also changes all places that were needlessly listing args
to use `skip_all`.
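For reference, `skip_all` records no arguments as span fields; a minimal sketch:

```rust
use tracing::instrument;

// With skip_all, secrets like tokens never end up in the span fields.
#[instrument(skip_all)]
async fn handle_request(auth_token: &str, query: &str) {
    // `auth_token` and `query` are not attached to the span.
}
```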
2023-06-29 15:53:16 +03:00
Alex Chi Z
c19681bc12 neon_local: support force init (#4363)
When we use a local SSD for benchmarks and create the `.neon` directory
before we run `cargo neon init`, the initialization process errors out
because the directory already exists. This PR adds a flag `--force` that
removes everything inside the directory if `.neon` already exists.
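A hedged sketch of the flag (struct and field names are illustrative, not the actual neon_local code):

```rust
use std::path::Path;

use anyhow::bail;
use clap::Parser;

#[derive(Parser)]
struct InitArgs {
    /// Wipe an existing .neon directory instead of failing.
    #[arg(long)]
    force: bool,
}

fn init_repo_dir(args: &InitArgs, neon_dir: &Path) -> anyhow::Result<()> {
    if neon_dir.exists() {
        if !args.force {
            bail!("{} already exists; pass --force to wipe it", neon_dir.display());
        }
        // --force: remove everything inside the pre-created directory.
        std::fs::remove_dir_all(neon_dir)?;
    }
    std::fs::create_dir_all(neon_dir)?;
    Ok(())
}
```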

---------

Signed-off-by: Alex Chi Z. <chi@neon.tech>
2023-06-28 11:39:07 -04:00
Shany Pozin
ec9b585837 Add Activating as a possible state for attaching a tenant in test_tenant_relocation validation (#4581)
Fix flakiness of test_tenant_relocation
2023-06-28 12:35:52 +03:00
Joonas Koivunen
02ef246db6 refactor: to pattern of await after timeout (#4432)
Refactor the `!completed` to be about `Option<_>` instead, side-stepping
any boolean true/false or false/true. As discussed on
https://github.com/neondatabase/neon/pull/4399#discussion_r1219321848
2023-06-28 06:18:45 +00:00
Konstantin Knizhnik
195d4932c6 Set LwLSN after WAL records when redo is performed or skipped (#4579)
## Problem

See #4516

Inspecting the log, it is possible to notice that if the lwlsn is set to
the beginning of the applied WAL record, then an incorrect version of the
page is loaded:

```
2023-06-27 18:36:51.930 GMT [3273945] CONTEXT:  WAL redo at 0/14AF6F0 for Heap/INSERT: off 2 flags 0x01; blkref #0: rel 1663/5/1259, blk 0 FPW
2023-06-27 18:36:51.930 GMT [3273945] LOG:  Do REDO block 0 of rel 1663/5/1259 fork 0 at LSN 0/**014AF6F0**..0/014AFA60
2023-06-27 18:37:02.173 GMT [3273963] LOG:  Read blk 0 in rel 1663/5/1259 fork 0 (request LSN 0/**014AF6F0**): lsn=0/**0143C7F8** at character 22
2023-06-27 18:37:47.780 GMT [3273945] LOG:  apply WAL record at 0/1BB8F38 xl_tot_len=188, xl_prev=0/1BB8EF8 
2023-06-27 18:37:47.780 GMT [3273945] CONTEXT:  WAL redo at 0/1BB8F38 for Heap/INPLACE: off 2; blkref #0: rel 1663/5/1259, blk 0
2023-06-27 18:37:47.780 GMT [3273945] LOG:  Do REDO block 0 of rel 1663/5/1259 fork 0 at LSN 0/01BB8F38..0/01BB8FF8
2023-06-27 18:37:47.780 GMT [3273945] CONTEXT:  WAL redo at 0/1BB8F38 for Heap/INPLACE: off 2; blkref #0: rel 1663/5/1259, blk 0
2023-06-27 18:37:47.780 GMT [3273945] PANIC:  invalid lp
```

## Summary of changes

1. Use the record-end LSN for both cases
2. Update the lwlsn for relation metadata

2023-06-28 09:04:13 +03:00
Alexander Bayandin
7fe0a4bf1a Fix promote-images job (#4577)
## Problem

```
+ crane tag neondatabase/extensions:3337 latest
Error: fetching "neondatabase/extensions:3337": GET https://index.docker.io/v2/neondatabase/extensions/manifests/3337: MANIFEST_UNKNOWN: manifest unknown; unknown tag=3337
```

We don't build `neondatabase/extensions` image yet (broken in
https://github.com/neondatabase/neon/pull/4505)

## Summary of changes
- Do not try to interact with `neondatabase/extensions`
2023-06-27 20:05:10 +03:00
bojanserafimov
ef2b9ffbcb Basebackup forward compatibility (#4572) 2023-06-27 12:05:27 -04:00
Alexander Bayandin
250a27fb85 Upload Postgres Extensions to S3 (#4505)
## Problem

We want to store Postgres Extensions in S3 (resolves
https://github.com/neondatabase/neon/issues/4493).

Proposed solution:
- Create a separate docker image (from scratch) that contains only
extensions
- Do not include extensions in the compute-node image (except for neon
extensions)*
- For release and main builds, extract extensions from the image and
upload them to S3 (`s3://<bucket>/<release
version>/<postgres_version>/...`)**

*) We're not doing this until the feature is fully implemented
**) This differs from the initial proposal in
https://github.com/neondatabase/neon/issues/4493 of putting extensions
straight into `s3://<bucket>/<postgres_version>/...`, because we can't
upload a directory atomically. A drawback of this is that we end up with
unnecessary copies of files, ~2.1 GB per release (i.e. +2.1 GB for each
commit in main as well). We don't really need to update extensions for
each release if there are no relevant changes, but that requires extra work.

## Summary of changes
- Created a separate stage in Dockerfile.compute-node
`postgres-extensions` that contains only extensions
- Added a separate step in a workflow that builds the `postgres-extensions`
image (because of a kaniko bug, this step is commented out: it takes way
too long to build)
- Extract extensions from the image and upload the files to S3 for release
and main builds
- Upload extensions only for staging (for now)
2023-06-27 16:23:22 +01:00
Shany Pozin
d748615c1f RemoteTimelineClient::delete_all() to use s3::delete_objects (#4461)
## Problem
[#4325](https://github.com/neondatabase/neon/issues/4325)
## Summary of changes
Use the delete_objects() method
2023-06-27 15:01:32 +03:00
Dmitry Rodionov
681c6910c2 Straighten the spec for timeline delete (#4538)
## Problem

Let's keep 500 for unusual stuff that is not considered normal. This came
up during one of the discussions around the console logs now seeing these
500s.

## Summary of changes

- Return 409 Conflict instead of 500
- Remove 200 ok status because it is not used anymore
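An illustrative sketch of the status-code mapping (the error type and variants are assumptions):

```rust
use hyper::StatusCode;

// Give the handler a distinguished error to map, instead of a catch-all 500.
enum DeleteTimelineError {
    AlreadyInProgress,
    Other(anyhow::Error),
}

fn status_for(err: &DeleteTimelineError) -> StatusCode {
    match err {
        DeleteTimelineError::AlreadyInProgress => StatusCode::CONFLICT, // 409
        DeleteTimelineError::Other(_) => StatusCode::INTERNAL_SERVER_ERROR, // 500
    }
}
```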
2023-06-27 13:56:32 +03:00
Vadim Kharitonov
148f0f9b21 Compile pg_roaringbitmap extension (#4564)
## Problem

```
postgres=# create extension roaringbitmap;
CREATE EXTENSION
postgres=# select roaringbitmap('{1,100,10}');
                 roaringbitmap
------------------------------------------------
 \x3a30000001000000000002001000000001000a006400
(1 row)
```
2023-06-27 10:55:03 +01:00
Shany Pozin
a7f3f5f356 Revert "run Layer::get_value_reconstruct_data in spawn_blocking#4498" (#4569)
This reverts commit 1faf69a698.
2023-06-27 10:57:28 +03:00
Felix Prasanna
00d1cfa503 bump VM_BUILDER_VERSION to 0.11.0 (#4566)
Routine bump of autoscaling version `0.8.0` -> `0.11.0`
2023-06-26 14:10:27 -04:00
Christian Schwarz
1faf69a698 run Layer::get_value_reconstruct_data in spawn_blocking (#4498)
This PR concludes the "async `Layer::get_value_reconstruct_data`"
project.

The problem we're solving is that, before this patch, we'd execute
`Layer::get_value_reconstruct_data` on the tokio executor threads.
This function is IO- and/or CPU-intensive.
The IO is using VirtualFile / std::fs; hence it's blocking.
This results in unfairness towards other tokio tasks, especially under
(disk) load.

Some context can be found at
https://github.com/neondatabase/neon/issues/4154
where I suspect (but can't prove) that load spikes of logical size
calculation cause heavy eviction skew.

Sadly we don't have tokio runtime/scheduler metrics to quantify the
unfairness.
But generally, we know blocking the executor threads on std::fs IO is
bad.
So, let's have this change and watch out for severe perf regressions in
staging & during rollout.

## Changes

* rename `Layer::get_value_reconstruct_data` to
`Layer::get_value_reconstruct_data_blocking`
* add a new blanket impl'd `Layer::get_value_reconstruct_data`
`async_trait` method that runs `get_value_reconstruct_data_blocking`
inside `spawn_blocking`.
* The `spawn_blocking` requires `'static` lifetime of the captured
variables; hence I had to change the data flow to _move_ the
`ValueReconstructState` into and back out of get_value_reconstruct_data
instead of passing a reference. It's a small struct, so I don't expect a
big performance penalty.
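A hedged sketch of the blanket wrapper described above, with simplified stand-in types rather than the real pageserver definitions:

```rust
use std::sync::Arc;

use async_trait::async_trait;
use tokio::task::spawn_blocking;

// Simplified stand-in; the real struct carries records and an optional image.
struct ValueReconstructState { /* ... */ }

#[async_trait]
trait Layer: Send + Sync + 'static {
    /// The old synchronous implementation, renamed with a `_blocking` suffix.
    fn get_value_reconstruct_data_blocking(
        &self,
        state: ValueReconstructState,
    ) -> anyhow::Result<ValueReconstructState>;

    /// Blanket async wrapper: move the state into spawn_blocking and back
    /// out, which satisfies the pool's 'static capture requirement.
    async fn get_value_reconstruct_data(
        self: Arc<Self>,
        state: ValueReconstructState,
    ) -> anyhow::Result<ValueReconstructState> {
        spawn_blocking(move || self.get_value_reconstruct_data_blocking(state)).await?
    }
}
```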

## Performance

Fundamentally, the code changes cause the following performance-relevant
changes:

* Latency & allocations: each `get_value_reconstruct_data` call now
makes a short-lived allocation because `async_trait` is just sugar for
boxed futures under the hood
* Latency: `spawn_blocking` adds some latency because it needs to move
the work to a thread pool
* using `spawn_blocking` plus the existing synchronous code inside is
probably more efficient than switching all the synchronous code
to tokio::fs, because _each_ tokio::fs call does `spawn_blocking` under
the hood.
* Throughput: the `spawn_blocking` thread pool is much larger than the
async executor thread pool. Hence, as long as the disks can keep up,
which they should according to AWS specs, we will be able to deliver
higher `get_value_reconstruct_data` throughput.
* Disk IOPS utilization: we will see higher disk utilization if we get
more throughput. Not a problem because the disks in prod are currently
under-utilized, according to node_exporter metrics & the AWS specs.
* CPU utilization: at higher throughput, CPU utilization will be higher.

Slightly higher latency under regular load is acceptable given the
throughput gains and expected better fairness during disk load peaks,
such as logical size calculation peaks uncovered in #4154.


## Full Stack Of Preliminary PRs

This PR builds on top of the following preliminary PRs

1. Clean-ups
  * https://github.com/neondatabase/neon/pull/4316
  * https://github.com/neondatabase/neon/pull/4317
  * https://github.com/neondatabase/neon/pull/4318
  * https://github.com/neondatabase/neon/pull/4319
  * https://github.com/neondatabase/neon/pull/4321
* Note: these were mostly to find an alternative to #4291, which I
thought we'd need in my original plan where we would need to convert
`Tenant::timelines` into an async locking primitive (#4333). In reviews,
we walked away from that, but these cleanups were still quite useful.
2. https://github.com/neondatabase/neon/pull/4364
3. https://github.com/neondatabase/neon/pull/4472
4. https://github.com/neondatabase/neon/pull/4476
5. https://github.com/neondatabase/neon/pull/4477
6. https://github.com/neondatabase/neon/pull/4485
7. https://github.com/neondatabase/neon/pull/4441
2023-06-26 11:43:11 +02:00
Christian Schwarz
44a441080d bring back spawn_blocking for compact_level0_phase1 (#4537)
The stats for `compact_level0_phase1` that I added in #4527 show the
following breakdown (24h data from prod, only looking at compactions
with > 1 L1 produced):

* 10%ish of wall-clock time spent between the two read locks
* I learned that the `DeltaLayer::iter()` and `DeltaLayer::key_iter()`
calls actually do IO, even before we call `.next()`. I suspect that is
why they take so much time between the locks.
* 80+% of wall-clock time spent writing layer files
* Lock acquisition time is irrelevant (low double-digit microseconds at
most)
* The generation of the holes holds the read lock for a relatively long
time and it's proportional to the amount of keys / IO required to
iterate over them (max: 110ms in prod; staging (nightly benchmarks):
multiple seconds).

Find below screenshots from my ad-hoc spreadsheet + some graphs.

![spreadsheet](https://github.com/neondatabase/neon/assets/956573/81398b3f-6fa1-40dd-9887-46a4715d9194)

![graph](https://github.com/neondatabase/neon/assets/956573/e4ac0393-f2c1-4187-a5e5-39a8b0c394c9)

![graph](https://github.com/neondatabase/neon/assets/956573/7977ade7-6aa5-4773-a0a2-f9729aecee0d)


## Changes In This PR

This PR makes the following changes:

* rearrange the `compact_level0_phase1` code such that we build the
`all_keys_iter` and `all_values_iter` later than before
* only grab the `Timeline::layers` lock once, and hold it until we've
computed the holes
* run compact_level0_phase1 in spawn_blocking, pre-grabbing the
`Timeline::layers` lock in the async code and passing it in as an
`OwnedRwLockReadGuard`.
* the code inside spawn_blocking drops this guard after computing the
holes
* the `OwnedRwLockReadGuard` requires the `Timeline::layers` to be
wrapped in an `Arc`. I think that's Ok, the locking for the RwLock is
more heavy-weight than an additional pointer indirection.
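A hedged sketch of the owned-guard hand-off (structure heavily simplified):

```rust
use std::sync::Arc;

use tokio::sync::RwLock;

struct LayerMap { /* ... */ }

// Pre-grab the lock in async code, hand the owned guard into
// spawn_blocking, and drop it there once the holes are computed.
async fn compact_level0(layers: Arc<RwLock<LayerMap>>) -> anyhow::Result<()> {
    let guard = layers.read_owned().await; // OwnedRwLockReadGuard<LayerMap>
    tokio::task::spawn_blocking(move || {
        // ... build all_keys_iter / all_values_iter, compute holes ...
        drop(guard); // release the read lock before the long write phase
        // ... write the new layer files ...
    })
    .await?;
    Ok(())
}
```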
 
## Alternatives Considered

The naive alternative is to throw the entire function into
`spawn_blocking`, and use `blocking_read` for `Timeline::layers` access.

What I've done in this PR is better because, with this alternative,
1. while we `blocking_read()`, we'd waste one slot in the spawn_blocking
pool
2. there's deadlock risk because the spawn_blocking pool is a finite
resource


![image](https://github.com/neondatabase/neon/assets/956573/46c419f1-6695-467e-b315-9d1fc0949058)

## Metadata

Fixes https://github.com/neondatabase/neon/issues/4492
2023-06-26 11:42:17 +02:00
Sasha Krassovsky
c215389f1c quote_ident identifiers when creating neon_superuser (#4562)
2023-06-24 10:34:15 +03:00
Sasha Krassovsky
b1477b4448 Create neon_superuser role, grant it to roles created from control plane (#4425)
## Problem
Currently, if a user creates a role, it won't by default have any grants
applied to it. If the compute restarts, the grants get applied. This
gives a very strange UX of being able to drop roles/not have any access
to anything at first, and then once something triggers a config
application, suddenly grants are applied. This removes these grants.
2023-06-24 01:38:27 +03:00
Christian Schwarz
a500bb06fb use preinitialize_metrics to initialize page cache metrics (#4557)
This is a follow-up to

```
commit 2252c5c282
Author: Alex Chi Z <iskyzh@gmail.com>
Date:   Wed Jun 14 17:12:34 2023 -0400

    metrics: convert some metrics to pageserver-level (#4490)
```
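A hedged sketch of the pattern (the metric name is illustrative): a lazily registered metric is absent from `/metrics` until first touched, so force it once at startup.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge, IntGauge};

static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_page_cache_size_current_bytes",
        "Current logical size of the page cache"
    )
    .expect("failed to register metric")
});

pub fn preinitialize_metrics() {
    // Touch the Lazy so the metric is registered and exported as 0 right away.
    Lazy::force(&PAGE_CACHE_SIZE_CURRENT_BYTES);
}
```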
2023-06-23 16:40:50 -04:00
Christian Schwarz
15456625c2 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker (#4560)
The consumption metrics synthetic size worker does logical size
calculation. Logical size calculation currently does synchronous disk
IO. This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving
other futures.

While there's work on the way to move the synchronous disk IO into
spawn_blocking, the quickfix here is to use the BACKGROUND_RUNTIME
instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking
MGMT_REQUEST_RUNTIME executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're
fixing here.

While we're at it, fix disk-usage-based eviction in a similar way. It
wasn't the culprit here, according to prod logs, but it can
theoretically be a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949
2023-06-23 15:40:36 -04:00
Vadim Kharitonov
a3f0dd2d30 Compile pg_uuidv7 (#4558)
The docs say that it should be added to `shared_preload_libraries`, but,
practically, it's not required.

```
postgres=# create extension pg_uuidv7;
CREATE EXTENSION
postgres=# SELECT uuid_generate_v7();
           uuid_generate_v7
--------------------------------------
 0188e823-3f8f-796c-a92c-833b0b2d1746
(1 row)
```
2023-06-23 15:56:49 +01:00
Christian Schwarz
76718472be add pageserver-global histogram for basebackup latency (#4559)
The histogram distinguishes by ok/err.

I took the liberty of creating a small abstraction for such use cases.
It helps keep the label values inside `metrics.rs`, right next
to the place where the metric and its labels are declared.
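A hedged sketch of such an abstraction (metric and label names are assumptions):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// The "ok"/"error" label values live in metrics.rs, next to the declaration.
static BASEBACKUP_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_basebackup_latency_seconds",
        "Time spent serving basebackup requests, by outcome",
        &["result"]
    )
    .expect("failed to register metric")
});

fn observe_basebackup(seconds: f64, ok: bool) {
    let outcome = if ok { "ok" } else { "error" };
    BASEBACKUP_LATENCY.with_label_values(&[outcome]).observe(seconds);
}
```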
2023-06-23 16:42:12 +02:00
Alexander Bayandin
c07b6ffbdc Fix git tag name for release (#4545)
## Problem

A git tag for a release has an extra `release-` prefix (it looks like
`release-release-3439`).
 
## Summary of changes
- Do not add the `release-` prefix when creating the git tag
2023-06-23 12:52:17 +01:00
Alexander Bayandin
6c3605fc24 Nightly Benchmarks: Increase timeout for pgbench-compare job (#4551)
## Problem

In the test environment, vacuum duration fluctuates from ~1h to ~5h; along
with another two 1h benchmarks (`select-only` and `simple-update`), it
could take up to 7h, which is longer than the 6h timeout.

## Summary of changes
- Increase timeout for pgbench-compare job to 8h
- Remove 6h timeouts from Nightly Benchmarks (this is the default value)
2023-06-23 12:47:37 +01:00
Vadim Kharitonov
d96d51a3b7 Update rust to 1.70.0 (#4550) 2023-06-23 13:09:04 +02:00
Alex Chi Z
a010b2108a pgserver: better template config file (#4554)
* `compaction_threshold` should be an integer, not a string.
* uncomment `[section]` headers so that if a user needs to modify the config,
they can simply uncomment the corresponding line. Otherwise it's easy
to forget to uncomment the `[section]` when uncommenting the
config item we want to configure.
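A hedged sketch of the resulting template (section and key names are illustrative):

```toml
# The section header is left uncommented so enabling a single item
# needs only one edit.
[tenant_config]
# compaction_threshold is an integer, not a string
#compaction_threshold = 10
```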

Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-23 10:18:06 +03:00
Anastasia Lubennikova
2f618f46be Use BUILD_TAG in compute_ctl binary. (#4541)
Pass BUILD_TAG to compute_ctl binary. 
We need it to access versioned extension storage.
2023-06-22 17:06:16 +03:00
Alexander Bayandin
d3aa8a48ea Update client libs for test_runner/pg_clients to their latest versions (#4547)
Resolves https://github.com/neondatabase/neon/security/dependabot/27
2023-06-21 16:20:35 +01:00
Christian Schwarz
e4da76f021 update_gc_info: fix typo in timeline_id tracing field (#4546)
Commit

```
commit 472cc17b7a
Author: Dmitry Rodionov <dmitry@neon.tech>
Date:   Thu Jun 15 17:30:12 2023 +0300

    propagate lock guard to background deletion task (#4495)
```

did a drive-by fix, but the fix had a typo.

```
gc_loop{tenant_id=2e2f2bff091b258ac22a4c4dd39bd25d}:update_gc_info{timline_id=837c688fd37c903639b9aa0a6dd3f1f1}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000024DA0D1-000000000443FB51}:panic{thread=background op worker location=pageserver/src/tenant/timeline.rs:4843:25}: missing extractors: ["TimelineId"]

Stack backtrace:
   0: utils::logging::tracing_panic_hook
             at /libs/utils/src/logging.rs:166:21
   1: <alloc::boxed::Box<F,A> as core::ops::function::Fn<Args>>::call
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/alloc/src/boxed.rs:2002:9
   2: std::panicking::rust_panic_with_hook
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:692:13
   3: std::panicking::begin_panic_handler::{{closure}}
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:579:13
   4: std::sys_common::backtrace::__rust_end_short_backtrace
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sys_common/backtrace.rs:137:18
   5: rust_begin_unwind
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:575:5
   6: core::panicking::panic_fmt
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panicking.rs:64:14
   7: pageserver::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id
             at /pageserver/src/tenant/timeline.rs:4843:25
   8: <pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}::{closure#0}
             at /pageserver/src/tenant/timeline.rs:4368:9
   9: <tracing::instrument::Instrumented<<pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}::{closure#0}> as core::future::future::Future>::poll
             at /.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
  10: <pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}
             at /pageserver/src/tenant/timeline.rs:4363:5
  11: <pageserver::tenant::timeline::Timeline>::get_reconstruct_data::{closure#0}
             at /pageserver/src/tenant/timeline.rs:2618:69
  12: <pageserver::tenant::timeline::Timeline>::get::{closure#0}
             at /pageserver/src/tenant/timeline.rs:565:13
  13: <pageserver::tenant::timeline::Timeline>::list_slru_segments::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:427:42
  14: <pageserver::tenant::timeline::Timeline>::is_latest_commit_timestamp_ge_than::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:390:13
  15: <pageserver::tenant::timeline::Timeline>::find_lsn_for_timestamp::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:338:17
  16: <pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}::{closure#0}
             at /pageserver/src/tenant/timeline.rs:3967:71
  17: <tracing::instrument::Instrumented<<pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}::{closure#0}> as core::future::future::Future>::poll
             at /.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
  18: <pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}
             at /pageserver/src/tenant/timeline.rs:3948:5
  19: <pageserver::tenant::Tenant>::refresh_gc_info_internal::{closure#0}
             at /pageserver/src/tenant.rs:2687:21
  20: <pageserver::tenant::Tenant>::gc_iteration_internal::{closure#0}
             at /pageserver/src/tenant.rs:2551:13
  21: <pageserver::tenant::Tenant>::gc_iteration::{closure#0}
             at /pageserver/src/tenant.rs:1490:13
  22: pageserver::tenant::tasks::gc_loop::{closure#0}::{closure#0}
             at /pageserver/src/tenant/tasks.rs:187:21
  23: pageserver::tenant::tasks::gc_loop::{closure#0}
             at /pageserver/src/tenant/tasks.rs:208:5
```
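An illustrative sketch of the fix itself, assuming the field was declared via `#[instrument]` (names simplified):

```rust
use tracing::instrument;

#[derive(Debug)]
struct TimelineId;

// The span field must be spelled `timeline_id` (the broken version read
// `timline_id`), or the extractor asserting its presence panics as above.
#[instrument(skip_all, fields(timeline_id = ?timeline_id))]
async fn update_gc_info(timeline_id: TimelineId) {
    // ...
}
```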

2023-06-21 18:00:14 +03:00
Christian Schwarz
870740c949 cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:50:52 +03:00
Dmitry Rodionov
75d583c04a Tenant::load: fix uninit timeline marker processing (#4458)
## Problem

During timeline creation we create a special mark file whose presence
indicates that initialization didn't complete successfully. In case of a
crash restart we can remove such a half-initialized timeline, and a
following retry from the control plane side should perform another attempt.

So in case of a possible crash restart during initial loading we have the
following picture:

```
timelines
| - <timeline_id>___uninit
| - <timeline_id>
| - | <timeline files>
```

We call `std::fs::read_dir` to walk the files in the `timelines` directory
one by one. If we see an uninit file, we proceed with deletion of both the
timeline directory and the uninit file. If we see a timeline, we check
whether an uninit file exists and do the same cleanup. But in fact it's
possible for both branches to be taken at the same time: the result of
readdir doesn't reflect subsequent directory-state modifications, so you
can still get a "valid" entry on the next iteration of the loop despite
the fact that it was deleted in one of the previous iterations.

To see that, you can apply the following patch (it disables uninit-mark
cleanup on successful timeline creation):
```diff
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4beb2664..b3cdad8f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -224,11 +224,6 @@ impl UninitializedTimeline<'_> {
                             )
                         })?;
                 }
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
                 v.insert(Arc::clone(&new_timeline));
 
                 new_timeline.maybe_spawn_flush_loop();
```
And perform the following steps:

```bash
neon_local init
neon_local start
neon_local tenant create
neon_local stop
neon_local start
```

The error is:
```log
INFO load{tenant_id=X}:blocking: Found an uninit mark file .neon/tenants/X/timelines/Y.___uninit, removing the timeline and its uninit mark
2023-06-09T18:43:41.664247Z ERROR load{tenant_id=X}: load failed, setting tenant state to Broken: failed to load metadata

Caused by:
    0: Failed to read metadata bytes from path .neon/tenants/X/timelines/Y/metadata
    1: No such file or directory (os error 2)
```

So the uninit mark got deleted together with the timeline directory, but we
still got a directory entry for it and tried to load it.

The bug prevented the tenant from being successfully loaded.

## Summary of changes

Ideally, I think we shouldn't place uninit marks in the same directory as
timeline directories, but move them to a separate directory and gather them
as an input to the actual listing. However, that would be sort of an
on-disk format change, so just check whether entries are still valid
before operating on them.
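A hedged sketch of that check (heavily simplified; the real code handles uninit marks and metadata loading):

```rust
use std::fs;
use std::path::Path;

// Entries yielded by read_dir may already have been removed by an earlier
// iteration of this same loop, so re-check existence before acting on them.
fn load_timelines(timelines_dir: &Path) -> anyhow::Result<()> {
    for entry in fs::read_dir(timelines_dir)? {
        let path = entry?.path();
        if !path.exists() {
            // Deleted together with another entry's uninit mark; skip.
            continue;
        }
        // ... handle uninit marks / load timeline metadata ...
    }
    Ok(())
}
```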
2023-06-21 14:25:58 +03:00
Alek Westover
b4c5beff9f list_files function in remote_storage (#4522) 2023-06-20 15:36:28 -04:00
bojanserafimov
90e1f629e8 Add test for skip_pg_catalog_updates (#4530) 2023-06-20 11:38:59 -04:00
121 changed files with 3476 additions and 2138 deletions


@@ -180,7 +180,8 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v3
@@ -321,8 +322,6 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
@@ -414,8 +413,6 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
@@ -501,8 +498,6 @@ jobs:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3


@@ -659,6 +659,7 @@ jobs:
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -716,10 +717,40 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use cache for the extensions image, thus it takes about the same amount of time as the compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in the compute-node image),
# so we won't build extensions twice, but extract them from compute-node.
#
# For now we use the extensions image only for new custom extensions
- name: Kaniko build extensions only
run: |
# Kaniko is supposed to clean up after itself if the --cleanup flag is set, but it doesn't.
# Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
@@ -736,7 +767,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.8.0
VM_BUILDER_VERSION: v0.11.1
steps:
- name: Checkout
@@ -838,8 +869,10 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -850,8 +883,10 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
- name: Configure Docker Hub login
run: |
@@ -873,16 +908,93 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
# While in the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
# In the compute image we have a slightly different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensions (they are always present in the compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests ]
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
@@ -914,7 +1026,7 @@ jobs:
exit 1
fi
- name: Create tag "release-${{ needs.tag.outputs.build-tag }}"
- name: Create git tag
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
@@ -924,7 +1036,7 @@ jobs:
github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}",
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})

Cargo.lock generated

@@ -200,17 +200,6 @@ dependencies = [
"critical-section",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi 0.1.19",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -805,18 +794,6 @@ dependencies = [
"libloading",
]
[[package]]
name = "clap"
version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
dependencies = [
"bitflags",
"clap_lex 0.2.4",
"indexmap",
"textwrap",
]
[[package]]
name = "clap"
version = "4.3.0"
@@ -837,7 +814,7 @@ dependencies = [
"anstream",
"anstyle",
"bitflags",
"clap_lex 0.5.0",
"clap_lex",
"strsim",
]
@@ -853,15 +830,6 @@ dependencies = [
"syn 2.0.16",
]
[[package]]
name = "clap_lex"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
dependencies = [
"os_str_bytes",
]
[[package]]
name = "clap_lex"
version = "0.5.0"
@@ -915,7 +883,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"clap 4.3.0",
"clap",
"compute_api",
"futures",
"hyper",
@@ -977,7 +945,7 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 4.3.0",
"clap",
"comfy-table",
"compute_api",
"git-version",
@@ -1047,19 +1015,19 @@ dependencies = [
[[package]]
name = "criterion"
version = "0.4.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
dependencies = [
"anes",
"atty",
"cast",
"ciborium",
"clap 3.2.25",
"clap",
"criterion-plot",
"is-terminal",
"itertools",
"lazy_static",
"num-traits",
"once_cell",
"oorandom",
"plotters",
"rayon",
@@ -1140,7 +1108,7 @@ dependencies = [
"crossterm_winapi",
"libc",
"mio",
"parking_lot",
"parking_lot 0.12.1",
"signal-hook",
"signal-hook-mio",
"winapi",
@@ -1210,7 +1178,7 @@ dependencies = [
"hashbrown 0.12.3",
"lock_api",
"once_cell",
"parking_lot_core",
"parking_lot_core 0.9.7",
]
[[package]]
@@ -1676,15 +1644,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.2.6"
@@ -1939,6 +1898,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
@@ -2267,16 +2229,6 @@ dependencies = [
"windows-sys 0.45.0",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -2349,9 +2301,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.52"
version = "0.10.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
dependencies = [
"bitflags",
"cfg-if",
@@ -2381,9 +2333,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.87"
version = "0.9.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
dependencies = [
"cc",
"libc",
@@ -2504,31 +2456,19 @@ dependencies = [
"winapi",
]
[[package]]
name = "os_str_bytes"
version = "6.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
[[package]]
name = "outref"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "pagectl"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"clap 4.3.0",
"clap",
"git-version",
"pageserver",
"postgres_ffi",
@@ -2547,7 +2487,7 @@ dependencies = [
"byteorder",
"bytes",
"chrono",
"clap 4.3.0",
"clap",
"close_fds",
"const_format",
"consumption_metrics",
@@ -2629,6 +2569,17 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
dependencies = [
"instant",
"lock_api",
"parking_lot_core 0.8.6",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@@ -2636,7 +2587,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core",
"parking_lot_core 0.9.7",
]
[[package]]
name = "parking_lot_core"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
dependencies = [
"cfg-if",
"instant",
"libc",
"redox_syscall 0.2.16",
"smallvec",
"winapi",
]
[[package]]
@@ -2652,6 +2617,16 @@ dependencies = [
"windows-sys 0.45.0",
]
[[package]]
name = "pbkdf2"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -2957,7 +2932,7 @@ dependencies = [
"lazy_static",
"libc",
"memchr",
"parking_lot",
"parking_lot 0.12.1",
"procfs",
"thiserror",
]
@@ -3022,12 +2997,11 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"atty",
"base64 0.13.1",
"bstr",
"bytes",
"chrono",
"clap 4.3.0",
"clap",
"consumption_metrics",
"futures",
"git-version",
@@ -3045,7 +3019,8 @@ dependencies = [
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot",
"parking_lot 0.12.1",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres_backend",
@@ -3056,6 +3031,7 @@ dependencies = [
"regex",
"reqwest",
"reqwest-middleware",
"reqwest-retry",
"reqwest-tracing",
"routerify",
"rstest",
@@ -3291,6 +3267,29 @@ dependencies = [
"thiserror",
]
[[package]]
name = "reqwest-retry"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
dependencies = [
"anyhow",
"async-trait",
"chrono",
"futures",
"getrandom",
"http",
"hyper",
"parking_lot 0.11.2",
"reqwest",
"reqwest-middleware",
"retry-policies",
"task-local-extensions",
"tokio",
"tracing",
"wasm-timer",
]
[[package]]
name = "reqwest-tracing"
version = "0.4.4"
@@ -3309,6 +3308,17 @@ dependencies = [
"tracing-opentelemetry",
]
[[package]]
name = "retry-policies"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
dependencies = [
"anyhow",
"chrono",
"rand",
]
[[package]]
name = "ring"
version = "0.16.20"
@@ -3507,7 +3517,7 @@ dependencies = [
"byteorder",
"bytes",
"chrono",
"clap 4.3.0",
"clap",
"const_format",
"crc32c",
"fs2",
@@ -3518,7 +3528,7 @@ dependencies = [
"hyper",
"metrics",
"once_cell",
"parking_lot",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_backend",
@@ -3937,7 +3947,7 @@ dependencies = [
"anyhow",
"async-stream",
"bytes",
"clap 4.3.0",
"clap",
"const_format",
"futures",
"futures-core",
@@ -3947,7 +3957,7 @@ dependencies = [
"hyper",
"metrics",
"once_cell",
"parking_lot",
"parking_lot 0.12.1",
"prost",
"tokio",
"tokio-stream",
@@ -4118,12 +4128,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "textwrap"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "thiserror"
version = "1.0.40"
@@ -4281,7 +4285,7 @@ dependencies = [
"futures-channel",
"futures-util",
"log",
"parking_lot",
"parking_lot 0.12.1",
"percent-encoding",
"phf",
"pin-project-lite",
@@ -4539,7 +4543,7 @@ name = "trace"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 4.3.0",
"clap",
"pageserver_api",
"utils",
"workspace_hack",
@@ -4641,7 +4645,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -4810,7 +4813,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"atty",
"bincode",
"byteorder",
"bytes",
@@ -4887,7 +4889,7 @@ name = "wal_craft"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 4.3.0",
"clap",
"env_logger",
"log",
"once_cell",
@@ -4991,6 +4993,21 @@ version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
[[package]]
name = "wasm-timer"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
dependencies = [
"futures",
"js-sys",
"parking_lot 0.11.2",
"pin-utils",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "web-sys"
version = "0.3.63"
@@ -5252,7 +5269,7 @@ dependencies = [
"anyhow",
"bytes",
"chrono",
"clap 4.3.0",
"clap",
"clap_builder",
"crossbeam-utils",
"either",


@@ -34,7 +34,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
async-stream = "0.3"
async-trait = "0.1"
atty = "0.2.14"
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
@@ -87,6 +86,7 @@ opentelemetry = "0.18.0"
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.10.0"
parking_lot = "0.12"
pbkdf2 = "0.12.1"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
@@ -95,6 +95,7 @@ regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustls = "0.20"
@@ -128,7 +129,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.18.0"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
@@ -170,7 +171,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.4"
criterion = "0.5.1"
rcgen = "0.10"
rstest = "0.17"
tempfile = "3.4"


@@ -2,6 +2,7 @@ ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
#########################################################################################
#
@@ -188,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -480,6 +481,60 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
#########################################################################################
#
# Layer "pg-uuidv7-pg-build"
# compile pg_uuidv7 extension
#
#########################################################################################
FROM build-deps AS pg-uuidv7-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
#########################################################################################
#
# Layer "pg-roaringbitmap-pg-build"
# compile pg_roaringbitmap extension
#
#########################################################################################
FROM build-deps AS pg-roaringbitmap-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
#########################################################################################
#
# Layer "pg-anon-pg-build"
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
#########################################################################################
#
# Layer "rust extensions"
@@ -588,6 +643,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
# Public extensions
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -613,6 +669,8 @@ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -634,6 +692,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
@@ -658,6 +719,22 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extension only
#
#########################################################################################
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# For now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
#########################################################################################
#
# Final layer


@@ -3,6 +3,7 @@
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .


@@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Create repository in .neon with proper paths to binaries and data
# Later that would be responsibility of a package install script
> cargo neon init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"
# start pageserver, safekeeper, and broker for their intercommunication
> cargo neon start
Starting neon broker at 127.0.0.1:50051
Starting neon broker at 127.0.0.1:50051.
storage_broker started, pid: 2918372
Starting pageserver at '127.0.0.1:64000' in '.neon'.
Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
pageserver started, pid: 2918386
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437
@@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# start postgres compute node
> cargo neon endpoint start main
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
# check list of running postgres instances
> cargo neon endpoint list
@@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
# start postgres on that branch
> cargo neon endpoint start migration_check --branch-name migration_check
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
# check the new list of running postgres instances
> cargo neon endpoint list
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
# this new postgres instance will have all the data from 'main' postgres,
# but modifications will not affect data in the original postgres
> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------


@@ -54,9 +54,15 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "local";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let http_port = *matches
@@ -250,6 +256,16 @@ fn main() -> Result<()> {
exit_code = ecode.code()
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}


@@ -133,6 +133,84 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
}
}
/// Create the special neon_superuser role, a slightly nerfed version of a real superuser
/// that we give to customers.
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let roles = spec
.cluster
.roles
.iter()
.map(|r| format!("'{}'", escape_literal(&r.name)))
.collect::<Vec<_>>();
let dbs = spec
.cluster
.databases
.iter()
.map(|db| format!("'{}'", escape_literal(&db.name)))
.collect::<Vec<_>>();
let roles_decl = if roles.is_empty() {
String::from("roles text[] := NULL;")
} else {
format!(
r#"
roles text[] := ARRAY(SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname IN ({}));"#,
roles.join(", ")
)
};
let database_decl = if dbs.is_empty() {
String::from("dbs text[] := NULL;")
} else {
format!(
r#"
dbs text[] := ARRAY(SELECT datname
FROM pg_catalog.pg_database
WHERE datname IN ({}));"#,
dbs.join(", ")
)
};
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
let query = format!(
r#"
DO $$
DECLARE
r text;
{}
{}
BEGIN
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
FOREACH r IN ARRAY roles LOOP
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
END LOOP;
END IF;
IF array_length(dbs, 1) IS NOT NULL THEN
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
END IF;
END IF;
END
$$;"#,
roles_decl, database_decl,
);
info!("Neon superuser created:\n{}", &query);
client
.simple_query(&query)
.map_err(|e| anyhow::anyhow!(e).context(query))?;
Ok(())
}
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
let mut state = self.state.lock().unwrap();
@@ -157,7 +235,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip(self, compute_state))]
#[instrument(skip_all, fields(%lsn))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Utc::now();
@@ -199,8 +277,8 @@ impl ComputeNode {
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip(self, storage_auth_token))]
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
#[instrument(skip_all)]
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
@@ -244,7 +322,7 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip(self, compute_state))]
#[instrument(skip_all)]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
@@ -302,7 +380,7 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip(self))]
#[instrument(skip_all)]
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
@@ -326,7 +404,7 @@ impl ComputeNode {
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip(self, compute_state))]
#[instrument(skip_all)]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
@@ -347,6 +425,8 @@ impl ComputeNode {
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
@@ -357,14 +437,16 @@ impl ComputeNode {
Ok(client) => client,
};
// Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.
client.simple_query("SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
// 'Close' connection
@@ -376,7 +458,7 @@ impl ComputeNode {
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip(self, client))]
#[instrument(skip_all)]
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?;
Ok(())
@@ -384,7 +466,7 @@ impl ComputeNode {
/// Similar to `apply_config()`, but does a bit different sequence of operations,
/// as it's used to reconfigure a previously started and configured Postgres node.
#[instrument(skip(self))]
#[instrument(skip_all)]
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
@@ -402,7 +484,7 @@ impl ComputeNode {
handle_roles(&spec, &mut client)?;
handle_databases(&spec, &mut client)?;
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(&spec, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
}
@@ -419,7 +501,7 @@ impl ComputeNode {
Ok(())
}
#[instrument(skip(self))]
#[instrument(skip_all)]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -434,9 +516,9 @@ impl ComputeNode {
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
self.apply_config(&compute_state)?;
}
@@ -444,11 +526,16 @@ impl ComputeNode {
let startup_end_time = Utc::now();
{
let mut state = self.state.lock().unwrap();
state.metrics.config_ms = startup_end_time
state.metrics.start_postgres_ms = config_time
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
state.metrics.config_ms = startup_end_time
.signed_duration_since(config_time)
.to_std()
.unwrap()
.as_millis() as u64;
state.metrics.total_startup_ms = startup_end_time
.signed_duration_since(compute_state.start_time)
.to_std()


@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;
use crate::compute::ComputeNode;
#[instrument(skip(compute))]
#[instrument(skip_all)]
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
info!("waiting for reconfiguration requests");
loop {


@@ -18,6 +18,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_target(false)
.with_writer(std::io::stderr);


@@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Escape a string for including it in a SQL literal
fn escape_literal(s: &str) -> String {
pub fn escape_literal(s: &str) -> String {
s.replace('\'', "''").replace('\\', "\\\\")
}
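As a quick illustration of the escaping behaviour (a sketch, not part of the change):

// escape_literal doubles single quotes and backslashes so the result can be
// embedded safely in a SQL string literal.
assert_eq!(escape_literal("O'Brien \\ Co"), "O''Brien \\\\ Co");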
@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
#[instrument(skip(pg))]
#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");
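A minimal sketch of the readiness check this comment describes, assuming the state word is the last line of `postmaster.pid` (possibly padded with spaces); the actual implementation may differ:

// Hypothetical helper: report whether the postmaster has written 'ready'
// as the state field of pgdata/postmaster.pid.
fn postmaster_is_ready(pgdata: &std::path::Path) -> bool {
    match std::fs::read_to_string(pgdata.join("postmaster.pid")) {
        Ok(contents) => contents.lines().last().map(str::trim) == Some("ready"),
        Err(_) => false,
    }
}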


@@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
xact.execute(query.as_str(), &[])?;
}
RoleAction::Create => {
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
let mut query: String = format!(
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
}
}
@@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
query.push_str(&db.to_pg_options());
let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?;
let grant_query: String = format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
name.pg_quote()
);
client.execute(grant_query.as_str(), &[])?;
}
};
@@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
#[instrument(skip_all)]
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
// via the web interface and proxy link auth. And also we grant a
// read / write all data privilege to every role. So also grant
// create to everyone.
// XXX: later we should stop messing with Postgres ACL in such horrible
// ways.
let roles = spec
.cluster
.roles
.iter()
.map(|r| r.name.pg_quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
let dbname = &db.name;
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.pg_quote(),
roles.join(", ")
);
info!("grant query {}", &query);
client.execute(query.as_str(), &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.


@@ -180,6 +180,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
// Wait until process is gone
wait_until_stopped(process_name, pid)?;
Ok(())
}
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
for retries in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {


@@ -308,7 +308,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
env.init(pg_version)
let force = init_match.get_flag("force");
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
@@ -1013,6 +1014,13 @@ fn cli() -> Command {
.help("If set, the node will be a hot replica on the specified timeline")
.required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(bool))
.long("force")
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1028,6 +1036,7 @@ fn cli() -> Command {
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)
.subcommand(
Command::new("timeline")


@@ -67,6 +67,7 @@ pub struct EndpointConf {
pg_port: u16,
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
}
//
@@ -135,6 +136,7 @@ impl ComputeControlPlane {
mode,
tenant_id,
pg_version,
skip_pg_catalog_updates: false,
});
ep.create_endpoint_dir()?;
@@ -148,6 +150,7 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates: false,
})?,
)?;
std::fs::write(
@@ -183,6 +186,9 @@ pub struct Endpoint {
// the endpoint runs in.
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
// Optimizations
skip_pg_catalog_updates: bool,
}
impl Endpoint {
@@ -216,6 +222,7 @@ impl Endpoint {
mode: conf.mode,
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
})
}
@@ -398,6 +405,16 @@ impl Endpoint {
String::from_utf8_lossy(&pg_ctl.stderr),
);
}
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}
@@ -450,7 +467,7 @@ impl Endpoint {
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: false,
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {
@@ -500,7 +517,13 @@ impl Endpoint {
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
let _child = cmd.spawn()?;
let child = cmd.spawn()?;
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;


@@ -364,7 +364,7 @@ impl LocalEnv {
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
@@ -372,11 +372,29 @@ impl LocalEnv {
"repository base path is missing"
);
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if base_path.exists() {
if force {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but remove
// all of its contents. This helps if the developer has symlinked another directory
// (e.g., an S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
@@ -392,7 +410,9 @@ impl LocalEnv {
}
}
fs::create_dir(base_path)?;
if !base_path.exists() {
fs::create_dir(base_path)?;
}
// Generate keypair for JWT.
//


@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64,
pub basebackup_ms: u64,
pub start_postgres_ms: u64,
pub config_ms: u64,
pub total_startup_ms: u64,
}


@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
pub mod metric_vec_duration;
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;


@@ -0,0 +1,23 @@
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
use std::{future::Future, time::Instant};
pub trait DurationResultObserver {
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
}
pub async fn observe_async_block_duration_by_result<
T,
E,
F: Future<Output = Result<T, E>>,
O: DurationResultObserver,
>(
observer: &O,
block: F,
) -> Result<T, E> {
let start = Instant::now();
let result = block.await;
let duration = start.elapsed();
observer.observe_result(&result, duration);
result
}
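To make the trait concrete, here is a hypothetical observer backed by a prometheus `HistogramVec` labelled by outcome; the metric itself is an assumption, not part of this change:

// Records the duration of each observed future under an "ok" / "err" label.
struct OutcomeHistogramObserver {
    histogram: prometheus::HistogramVec,
}

impl DurationResultObserver for OutcomeHistogramObserver {
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
        let outcome = if res.is_ok() { "ok" } else { "err" };
        self.histogram
            .with_label_values(&[outcome])
            .observe(duration.as_secs_f64());
    }
}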


@@ -70,6 +70,14 @@ impl RemotePath {
pub fn join(&self, segment: &Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &PathBuf {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
}
/// Storage (potentially remote) API to manage its state.
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError>;
/// Lists all files in a directory "recursively"
/// (not really recursively, because AWS has a flat namespace).
/// Note: this is subtly different from list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
&self,
@@ -174,6 +195,14 @@ impl GenericRemoteStorage {
}
}
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
pub async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
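A hedged usage sketch of the new `list_files` API, assuming a configured `GenericRemoteStorage` handle; the prefix and file extension are illustrative:

// List everything under "tenants/" and keep only ".txt" entries.
async fn list_txt_files(storage: &GenericRemoteStorage) -> anyhow::Result<Vec<RemotePath>> {
    let prefix = RemotePath::new(std::path::Path::new("tenants"))?;
    let files = storage.list_files(Some(&prefix)).await?;
    Ok(files
        .into_iter()
        .filter(|path| path.extension() == Some("txt"))
        .collect())
}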


@@ -48,6 +48,14 @@ impl LocalFs {
Ok(Self { storage_root })
}
// mirrors S3Bucket::s3_object_to_relative_path
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
let relative_path = key
.strip_prefix(&self.storage_root)
.expect("relative path must contain storage_root as prefix");
RemotePath(relative_path.into())
}
async fn read_storage_metadata(
&self,
file_path: &Path,
@@ -132,6 +140,34 @@ impl RemoteStorage for LocalFs {
Ok(prefixes)
}
// recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(),
};
let mut files = vec![];
let mut directory_queue = vec![full_path.clone()];
while !directory_queue.is_empty() {
let cur_folder = directory_queue
.pop()
.expect("queue cannot be empty: we just checked");
let mut entries = fs::read_dir(cur_folder.clone()).await?;
while let Some(entry) = entries.next_entry().await? {
let file_name: PathBuf = entry.file_name().into();
let full_file_name = cur_folder.clone().join(&file_name);
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
files.push(file_remote_path.clone());
if full_file_name.is_dir() {
directory_queue.push(full_file_name);
}
}
}
Ok(files)
}
async fn upload(
&self,
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,


@@ -347,6 +347,51 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
metrics::inc_list_objects();
let response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(folder_name.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})
.context("Failed to list files in S3 bucket")?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
all_files.push(remote_path);
}
match response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(all_files)
}
async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,


@@ -83,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_prefixes(prefix).await
}
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
self.inner.list_files(folder).await
}
async fn upload(
&self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,


@@ -88,6 +88,58 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
Ok(())
}
/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars to be set. The test skips the real code and passes if the env vars are not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_s3_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
#[tokio::test]
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path().to_str().expect("must be valid name"))
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
@@ -121,10 +173,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
let data3 = "remote blob data3".as_bytes();
let data3_len = data3.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
@@ -133,8 +190,18 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
ctx.client
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
@@ -248,6 +315,66 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledS3WithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
}
struct S3WithSimpleTestBlobs {
enabled: EnabledS3,
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
info!(
"`{}` env variable is not set, skipping the test",
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
);
return Self::Disabled;
}
let max_keys_in_list_response = 10;
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
ControlFlow::Continue(uploads) => {
info!("Remote objects created successfully");
Self::Enabled(S3WithSimpleTestBlobs {
enabled,
remote_blobs: uploads,
})
}
ControlFlow::Break(uploads) => Self::UploadsFailed(
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
S3WithSimpleTestBlobs {
enabled,
remote_blobs: uploads,
},
),
}
}
async fn teardown(self) {
match self {
Self::Disabled => {}
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
}
}
}
}
fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
@@ -258,7 +385,7 @@ fn create_s3_client(
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
.as_nanos();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
@@ -364,3 +491,52 @@ async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<
}
}
}
// Uploads files `folder{j}/blob_{i}.txt`. See the test description for more details.
async fn upload_simple_s3_data(
client: &Arc<GenericRemoteStorage>,
upload_tasks_count: usize,
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
info!("Creating {upload_tasks_count} S3 files");
let mut upload_tasks = JoinSet::new();
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
upload_tasks.spawn(async move {
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
let blob_path = RemotePath::new(&blob_path)
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
}
let mut upload_tasks_failed = false;
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
while let Some(task_run_result) = upload_tasks.join_next().await {
match task_run_result
.context("task join failed")
.and_then(|task_result| task_result.context("upload task failed"))
{
Ok(upload_path) => {
uploaded_blobs.insert(upload_path);
}
Err(e) => {
error!("Upload task failed: {e:?}");
upload_tasks_failed = true;
}
}
}
if upload_tasks_failed {
ControlFlow::Break(uploaded_blobs)
} else {
ControlFlow::Continue(uploaded_blobs)
}
}


@@ -5,7 +5,6 @@ edition.workspace = true
license.workspace = true
[dependencies]
atty.workspace = true
sentry.workspace = true
async-trait.workspace = true
anyhow.workspace = true


@@ -1,5 +1,6 @@
use hyper::{header, Body, Response, StatusCode};
use serde::{Deserialize, Serialize};
use std::error::Error as StdError;
use thiserror::Error;
use tracing::error;
@@ -15,7 +16,7 @@ pub enum ApiError {
Unauthorized(String),
#[error("NotFound: {0}")]
NotFound(anyhow::Error),
NotFound(Box<dyn StdError + Send + Sync + 'static>),
#[error("Conflict: {0}")]
Conflict(String),


@@ -84,7 +84,7 @@ pub fn init(
let r = r.with({
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(atty::is(atty::Stream::Stdout))
.with_ansi(false)
.with_writer(std::io::stdout);
let log_layer = match log_format {
LogFormat::Json => log_layer.json().boxed(),


@@ -1,22 +1,23 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::<LayerDescriptor>::default();
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut layer_map = LayerMap::default();
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
@@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
updates.insert_historic(layer.layer_desc().clone());
}
println!("min: {min_lsn}, max: {max_lsn}");
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
}
/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = LayerDescriptor {
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(i)..Lsn(i + 1),
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
TenantId::generate(),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
false,
0,
));
updates.insert_historic(layer.layer_desc().clone());
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());


@@ -495,50 +495,50 @@ fn start_pageserver(
Ok(())
},
);
}
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
// The actual size calculation does need downloads, and
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
"consumption metrics collection",
true,
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
// The actual size calculation does need downloads, and
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
task_mgr::spawn(
crate::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
"consumption metrics collection",
true,
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
}
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
}
// Spawn a task to listen for libpq connections. It will spawn further tasks


@@ -96,12 +96,12 @@ pub mod defaults {
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
# [tenant_config]
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
#gc_period = '{DEFAULT_GC_PERIOD}'
#gc_horizon = {DEFAULT_GC_HORIZON}
@@ -111,7 +111,8 @@ pub mod defaults {
#min_resident_size_override = .. # in bytes
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
# [remote_storage]
[remote_storage]
"###
);


@@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size";
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
#[serde_as]
#[derive(Serialize, Debug)]
struct Ids {
@@ -73,7 +75,10 @@ pub async fn collect_metrics(
);
// define client here to reuse it for all requests
let client = reqwest::Client::new();
let client = reqwest::ClientBuilder::new()
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
@@ -83,7 +88,7 @@ pub async fn collect_metrics(
info!("collect_metrics received cancellation request");
return Ok(());
},
_ = ticker.tick() => {
tick_at = ticker.tick() => {
// send cached metrics every cached_metric_collection_interval
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
@@ -93,6 +98,12 @@ pub async fn collect_metrics(
}
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
);
}
}
}
@@ -273,31 +284,42 @@ pub async fn collect_metrics_iteration(
})
.expect("PageserverConsumptionMetric should not fail serialization");
let res = client
.post(metric_collection_endpoint.clone())
.json(&chunk_json)
.send()
.await;
const MAX_RETRIES: u32 = 3;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk_to_send.iter() {
// Report if the metric value is suspiciously large
if metric.value > (1u64 << 40) {
for attempt in 0..MAX_RETRIES {
let res = client
.post(metric_collection_endpoint.clone())
.json(&chunk_json)
.send()
.await;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk_to_send
.iter()
.filter(|metric| metric.value > (1u64 << 40))
{
// Report if the metric value is suspiciously large
error!("potentially abnormal metric value: {:?}", metric);
}
}
break;
}
Err(err) if err.is_timeout() => {
error!(attempt, "timeout sending metrics, retrying immediately");
continue;
}
Err(err) => {
error!(attempt, ?err, "failed to send metrics");
break;
}
}
Err(err) => {
error!("failed to send metrics: {:?}", err);
}
}
}
@@ -317,7 +339,7 @@ pub async fn calculate_synthetic_size_worker(
_ = task_mgr::shutdown_watcher() => {
return Ok(());
},
_ = ticker.tick() => {
tick_at = ticker.tick() => {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
@@ -343,6 +365,12 @@ pub async fn calculate_synthetic_size_worker(
}
}
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
);
}
}
}


@@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task(
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await;
info!("disk usage based eviction task finishing");
Ok(())
},
);
@@ -126,13 +125,16 @@ async fn disk_usage_eviction_task(
tenants_dir: &Path,
cancel: CancellationToken,
) {
scopeguard::defer! {
info!("disk usage based eviction task finishing");
};
use crate::tenant::tasks::random_init_delay;
{
if random_init_delay(task_config.period, &cancel)
.await
.is_err()
{
info!("shutting down");
return;
}
}
@@ -167,7 +169,6 @@ async fn disk_usage_eviction_task(
tokio::select! {
_ = tokio::time::sleep_until(sleep_until) => {},
_ = cancel.cancelled() => {
info!("shutting down");
break
}
}
@@ -314,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
partition,
candidate.layer.get_tenant_id(),
candidate.layer.get_timeline_id(),
candidate.layer.filename().file_name(),
candidate.layer,
);
}
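
The eviction task now emits its "finishing" line from a single scopeguard::defer! instead of duplicating it on every exit path. A minimal sketch of the idiom, assuming the scopeguard and tokio-util crates:

use scopeguard::defer;
use tracing::info;

async fn eviction_task_sketch(cancel: tokio_util::sync::CancellationToken) {
    // Runs on every return path from this function, including early returns.
    defer! {
        info!("disk usage based eviction task finishing");
    };
    if cancel.is_cancelled() {
        info!("shutting down");
        return; // the deferred log line still fires here
    }
    // ... main eviction loop would go here ...
}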

View File

@@ -186,10 +186,8 @@ paths:
schema:
$ref: "#/components/schemas/Error"
delete:
description: "Attempts to delete specified timeline. On 500 errors should be retried"
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
responses:
"200":
description: Ok
"400":
description: Error when no tenant id found in path or no timeline id
content:
@@ -214,6 +212,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Deletion is already in progress, continue polling
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"412":
description: Tenant is missing, or timeline has children
content:
@@ -718,6 +722,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"406":
description: Permanently unsatisfiable request, don't retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"409":
description: Timeline already exists, creation skipped
content:
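
Per the updated description, clients should treat 409 (deletion already in progress) and 500 as retryable and keep polling the delete endpoint. A hypothetical client-side loop honoring that contract (the handling of statuses other than 409/5xx, and the retry budget, are assumptions, not part of the spec above):

async fn delete_timeline_poll(client: &reqwest::Client, url: &str) -> anyhow::Result<()> {
    for _ in 0..60 {
        let status = client.delete(url).send().await?.status();
        match status.as_u16() {
            200 => return Ok(()),
            // 409: deletion already in progress; 5xx: transient, retry.
            409 | 500..=599 => tokio::time::sleep(std::time::Duration::from_secs(1)).await,
            code => anyhow::bail!("terminal status {code}, giving up"),
        }
    }
    anyhow::bail!("deletion did not complete in time")
}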

View File

@@ -23,7 +23,6 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::disk_usage_eviction_task;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -35,6 +34,7 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
auth::JwtAuth,
http::{
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
}
}
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid))
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ SetNewTenantConfigError::Persist(_) => {
ApiError::InternalServerError(anyhow::Error::new(e))
@@ -182,11 +182,12 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
HasChildren(children) => ApiError::PreconditionFailed(
format!("Cannot delete timeline which has child timelines: {children:?}")
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
@@ -327,15 +328,22 @@ async fn timeline_create_handler(
&ctx,
)
.await {
Ok(Some(new_timeline)) => {
Ok(new_timeline) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
Err(err) => Err(ApiError::InternalServerError(err)),
Err(tenant::CreateTimelineError::AlreadyExists) => {
json_response(StatusCode::CONFLICT, ())
}
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
format!("{err:#}")
))
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
}
}
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
@@ -397,7 +405,7 @@ async fn timeline_detail_handler(
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline_info = build_timeline_info(
&timeline,
@@ -1061,7 +1069,7 @@ async fn timeline_download_remote_layers_handler_get(
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
json_response(StatusCode::OK, info)
}
@@ -1072,7 +1080,7 @@ async fn active_timeline_of_active_tenant(
let tenant = mgr::get_tenant(tenant_id, true).await?;
tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)
.map_err(|e| ApiError::NotFound(e.into()))
}
async fn always_panic_handler(
@@ -1128,8 +1136,6 @@ async fn disk_usage_eviction_run(
freed_bytes: 0,
};
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
let (tx, rx) = tokio::sync::oneshot::channel();
let state = get_state(&r);
@@ -1147,7 +1153,7 @@ async fn disk_usage_eviction_run(
let _g = cancel.drop_guard();
crate::task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,

View File

@@ -1,8 +1,9 @@
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
UIntGauge, UIntGaugeVec,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::models::TenantState;
@@ -129,6 +130,122 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub struct PageCacheMetrics {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_ephemeral: IntCounter,
pub read_accesses_immutable: IntCounter,
pub read_hits_ephemeral: IntCounter,
pub read_hits_immutable: IntCounter,
pub read_hits_materialized_page_exact: IntCounter,
pub read_hits_materialized_page_older_lsn: IntCounter,
}
static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_hits_total",
"Number of read accesses to the page cache that hit",
&["key_kind", "hit_kind"]
)
.expect("failed to define a metric")
});
static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_accesses_total",
"Number of read accesses to the page cache",
&["key_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
read_accesses_ephemeral: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
read_hits_ephemeral: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["ephemeral", "-"])
.unwrap()
},
read_hits_immutable: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["immutable", "-"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "exact"])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "older_lsn"])
.unwrap()
},
});
pub struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
pub current_bytes_ephemeral: UIntGauge,
pub current_bytes_immutable: UIntGauge,
pub current_bytes_materialized_page: UIntGauge,
}
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_page_cache_size_current_bytes",
"Current size of the page cache in bytes, by key kind",
&["key_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_ephemeral: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_wait_lsn_seconds",
@@ -203,11 +320,11 @@ pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_synthetic_size",
"Synthetic size of each tenant",
"pageserver_tenant_synthetic_cached_size_bytes",
"Synthetic size of each tenant in bytes",
&["tenant_id"]
)
.expect("Failed to register pageserver_tenant_synthetic_size metric")
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
@@ -424,6 +541,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub struct BasebackupQueryTime(HistogramVec);
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
BasebackupQueryTime({
register_histogram_vec!(
"pageserver_basebackup_query_seconds",
"Histogram of basebackup queries durations, by result type",
&["result"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
})
});
impl DurationResultObserver for BasebackupQueryTime {
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
let label_value = if res.is_ok() { "ok" } else { "error" };
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
metric.observe(duration.as_secs_f64());
}
}
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
@@ -823,11 +961,6 @@ impl TimelineMetrics {
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
// TODO(chi): remove this once we remove Lazy for all metrics. Otherwise this will not appear in the exporter
// and integration test will error.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
TimelineMetrics {
tenant_id,
timeline_id,
@@ -951,7 +1084,6 @@ impl RemoteTimelineClientMetrics {
op_kind: &RemoteOpKind,
status: &'static str,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.remote_operation_time.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str(), status);
let metric = guard.entry(key).or_insert_with(move || {
@@ -973,7 +1105,6 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntGauge {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
@@ -994,7 +1125,6 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.calls_started_hist.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
@@ -1015,7 +1145,6 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntCounter {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.bytes_started_counter.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
@@ -1036,7 +1165,6 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntCounter {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.bytes_finished_counter.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
@@ -1302,4 +1430,8 @@ pub fn preinitialize_metrics() {
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
// Python tests need these.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
}
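
The metrics above are registered once as labelled vecs and then resolved to concrete children with get_metric_with_label_values at startup, so hot paths increment a pre-bound counter without any per-call label lookup. A minimal sketch of that pattern using the prometheus crate directly (the repo's metrics wrapper exposes a similar API; metric names here are illustrative):

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

static READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_read_accesses_total",
        "Read accesses by key kind",
        &["key_kind"]
    )
    .expect("failed to define a metric")
});

// Resolve the label value once; incrementing the bound child afterwards
// involves no label-to-child HashMap lookup on the hot path.
static READ_ACCESSES_IMMUTABLE: Lazy<IntCounter> = Lazy::new(|| {
    READ_ACCESSES
        .get_metric_with_label_values(&["immutable"])
        .unwrap()
});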

View File

@@ -53,8 +53,8 @@ use utils::{
lsn::Lsn,
};
use crate::repository::Key;
use crate::tenant::writeback_ephemeral_file;
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,6 +187,8 @@ pub struct PageCache {
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
size_metrics: &'static PageCacheSizeMetrics,
}
///
@@ -313,6 +315,10 @@ impl PageCache {
key: &Key,
lsn: Lsn,
) -> Option<(Lsn, PageReadGuard)> {
crate::metrics::PAGE_CACHE
.read_accesses_materialized_page
.inc();
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
@@ -323,8 +329,21 @@ impl PageCache {
};
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
Some((lsn, guard))
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
} = cache_key
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE
.read_hits_materialized_page_exact
.inc();
} else {
crate::metrics::PAGE_CACHE
.read_hits_materialized_page_older_lsn
.inc();
}
Some((available_lsn, guard))
} else {
panic!("unexpected key type in slot");
}
@@ -499,11 +518,31 @@ impl PageCache {
/// ```
///
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::EphemeralPage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
),
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
),
};
read_access.inc();
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) =
@@ -681,6 +720,9 @@ impl PageCache {
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
versions.remove(version_idx);
self.size_metrics
.current_bytes_materialized_page
.sub_page_sz(1);
if versions.is_empty() {
old_entry.remove_entry();
}
@@ -693,11 +735,13 @@ impl PageCache {
let mut map = self.ephemeral_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
}
}
}
@@ -725,6 +769,9 @@ impl PageCache {
slot_idx,
},
);
self.size_metrics
.current_bytes_materialized_page
.add_page_sz(1);
None
}
}
@@ -735,6 +782,7 @@ impl PageCache {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
None
}
}
@@ -745,6 +793,7 @@ impl PageCache {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_immutable.add_page_sz(1);
None
}
}
@@ -844,6 +893,12 @@ impl PageCache {
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_ephemeral.set_page_sz(0);
size_metrics.current_bytes_immutable.set_page_sz(0);
size_metrics.current_bytes_materialized_page.set_page_sz(0);
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
@@ -866,6 +921,30 @@ impl PageCache {
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
}
}
}
trait PageSzBytesMetric {
fn set_page_sz(&self, count: usize);
fn add_page_sz(&self, count: usize);
fn sub_page_sz(&self, count: usize);
}
#[inline(always)]
fn count_times_page_sz(count: usize) -> u64 {
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
}
impl PageSzBytesMetric for metrics::UIntGauge {
fn set_page_sz(&self, count: usize) {
self.set(count_times_page_sz(count));
}
fn add_page_sz(&self, count: usize) {
self.add(count_times_page_sz(count));
}
fn sub_page_sz(&self, count: usize) {
self.sub(count_times_page_sz(count));
}
}
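
lock_for_read above counts exactly one access per call and counts a hit only when the key is found on the first pass; a find on a later loop iteration, after a victim was evicted, is not misreported as a hit. A condensed sketch of that accounting, with closures standing in for the cache internals:

// Minimal sketch of the first-iteration-only hit accounting; try_lookup and
// evict_and_fill are illustrative stand-ins for the page cache internals.
fn lock_for_read_sketch(
    mut try_lookup: impl FnMut() -> Option<u64>, // returns a slot index on find
    mut evict_and_fill: impl FnMut(),
    accesses: &prometheus::IntCounter,
    hits: &prometheus::IntCounter,
) -> u64 {
    accesses.inc(); // every call is exactly one access
    let mut first_iteration = true;
    loop {
        if let Some(slot) = try_lookup() {
            if first_iteration {
                hits.inc(); // only a first-pass find counts as a hit
            }
            return slot;
        }
        first_iteration = false;
        evict_and_fill();
    }
}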

View File

@@ -390,7 +390,9 @@ impl PageServerHandler {
};
// Check that the timeline exists
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -902,7 +904,7 @@ where
self.check_permission(Some(tenant_id))?;
let lsn = if params.len() == 3 {
let lsn = if params.len() >= 3 {
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
@@ -911,10 +913,24 @@ where
None
};
// Check that the timeline exists
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
metrics::metric_vec_duration::observe_async_block_duration_by_result(
&*crate::metrics::BASEBACKUP_QUERY_TIME,
async move {
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
None,
false,
ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
anyhow::Ok(())
},
)
.await?;
}
// return pair of prev_lsn and last_lsn
else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1230,6 +1246,6 @@ async fn get_active_tenant_timeline(
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}
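
The basebackup handler is now timed by wrapping the whole async block and labelling the histogram by outcome, via observe_async_block_duration_by_result and the DurationResultObserver impl shown earlier. A simplified standalone version of that mechanism (assumed behavior: run the future, then observe the elapsed time alongside the Ok/Err result):

use std::time::Instant;

async fn observe_by_result<T, E, F>(
    fut: F,
    observe: impl Fn(&Result<T, E>, std::time::Duration),
) -> Result<T, E>
where
    F: std::future::Future<Output = Result<T, E>>,
{
    let started = Instant::now();
    let res = fut.await;
    // The observer picks the label ("ok" vs "error") from the result.
    observe(&res, started.elapsed());
    res
}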

View File

@@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> {
ctx: &RequestContext,
) -> Result<(), RelationError> {
if rel.relnode == 0 {
return Err(RelationError::AlreadyExists);
return Err(RelationError::InvalidRelnode);
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.

View File

@@ -506,17 +506,17 @@ pub async fn shutdown_tasks(
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
let completed = tokio::select! {
let join_handle = tokio::select! {
biased;
_ = &mut join_handle => { true },
_ = &mut join_handle => { None },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
false
Some(join_handle)
}
};
if !completed {
if let Some(join_handle) = join_handle {
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
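
The shutdown path now keeps the JoinHandle when the one-second grace period expires, so it can still be awaited after logging progress. The same pattern in isolation, as a sketch:

async fn wait_with_progress_log(mut join_handle: tokio::task::JoinHandle<()>, task_name: &str) {
    let pending = tokio::select! {
        biased;
        _ = &mut join_handle => None,
        _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
            tracing::info!("waiting for {task_name} to shut down");
            Some(join_handle)
        }
    };
    if let Some(join_handle) = pending {
        // second await after logging; panics are logged by the task wrapper
        let _ = join_handle.await;
    }
}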

View File

@@ -421,12 +421,32 @@ remote:
}
}
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantId,
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
NotFound {
tenant_id: TenantId,
timeline_id: TimelineId,
},
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("NotFound")]
NotFound,
#[error("HasChildren")]
HasChildren(Vec<TimelineId>),
#[error("Timeline deletion is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
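
DeleteTimelineError (and CreateTimelineError below) lean on thiserror: #[error(transparent)] forwards Display and source() to the wrapped error, and #[from] lets the ? operator convert an anyhow::Error into the Other variant. A minimal sketch of the idiom (the type and file name are illustrative):

use thiserror::Error;

#[derive(Debug, Error)]
enum DeleteSketchError {
    #[error("timeline deletion is already in progress")]
    AlreadyInProgress,
    // transparent: Display and source() come from the inner error;
    // #[from]: `?` converts anyhow::Error into this variant automatically.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

fn remove_mark() -> Result<(), DeleteSketchError> {
    // anyhow context flows into Other via the generated From impl.
    std::fs::remove_file("timeline.___uninit")
        .map_err(|e| anyhow::anyhow!(e).context("remove uninit mark"))?;
    Ok(())
}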
@@ -481,6 +501,16 @@ impl DeletionGuard {
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
AlreadyExists,
#[error(transparent)]
AncestorLsn(anyhow::Error),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Tenant {
/// Yet another helper for timeline initialization.
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -570,6 +600,7 @@ impl Tenant {
.layers
.read()
.await
.0
.iter_historic_layers()
.next()
.is_some(),
@@ -946,6 +977,117 @@ impl Tenant {
tenant
}
pub fn scan_and_sort_timelines_dir(
self: Arc<Tenant>,
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
let timelines_dir = self.conf.timelines_path(&self.tenant_id);
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
for entry in
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
{
let entry = entry.context("read timeline dir entry")?;
let timeline_dir = entry.path();
if crate::is_temporary(&timeline_dir) {
info!(
"Found temporary timeline directory, removing: {}",
timeline_dir.display()
);
if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
error!(
"Failed to remove temporary directory '{}': {:?}",
timeline_dir.display(),
e
);
}
} else if is_uninit_mark(&timeline_dir) {
if !timeline_dir.exists() {
warn!(
"Timeline dir entry become invalid: {}",
timeline_dir.display()
);
continue;
}
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
if !timeline_dir.exists() {
warn!(
"Timeline dir entry become invalid: {}",
timeline_dir.display()
);
continue;
}
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file = self
.conf
.timeline_uninit_mark_file_path(self.tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!(
%timeline_id,
"Found an uninit mark file, removing the timeline and its uninit mark",
);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
continue;
}
let file_name = entry.file_name();
if let Ok(timeline_id) =
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
{
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
.context("failed to load metadata")?;
timelines_to_load.insert(timeline_id, metadata);
} else {
// A file or directory that doesn't look like a timeline ID
warn!(
"unexpected file or directory in timelines directory: {}",
file_name.to_string_lossy()
);
}
}
}
// Sort the array of timeline IDs into tree-order, so that parent comes before
// all its children.
tree_sort_timelines(timelines_to_load)
}
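
tree_sort_timelines itself is not shown in this diff; conceptually it orders timelines so every ancestor precedes its children, which lets loading proceed parent-first. A plausible sketch of such an ordering, using u64 as a stand-in for TimelineId (an assumption for illustration, not the repo's implementation):

use std::collections::{HashMap, HashSet};

// key: timeline id, value: ancestor id (None for branch roots)
fn tree_sort_sketch(mut pending: HashMap<u64, Option<u64>>) -> Vec<u64> {
    let mut emitted: HashSet<u64> = HashSet::new();
    let mut order = Vec::with_capacity(pending.len());
    while !pending.is_empty() {
        // A timeline is ready once its ancestor (if any) has been emitted.
        let ready: Vec<u64> = pending
            .iter()
            .filter(|(_, anc)| anc.map_or(true, |a| emitted.contains(&a)))
            .map(|(id, _)| *id)
            .collect();
        assert!(!ready.is_empty(), "ancestor cycle or missing ancestor");
        for id in ready {
            pending.remove(&id);
            emitted.insert(id);
            order.push(id);
        }
    }
    order
}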
///
/// Background task to load in-memory data structures for this tenant, from
/// files on disk. Used at pageserver startup.
@@ -962,110 +1104,16 @@ impl Tenant {
utils::failpoint_sleep_millis_async!("before-loading-tenant");
// TODO split this into two functions, scan and actual load
// Load in-memory state to reflect the local files on disk
//
// Scan the directory, peek into the metadata file of each timeline, and
// collect a list of timelines and their ancestors.
let tenant_id = self.tenant_id;
let conf = self.conf;
let span = info_span!("blocking");
let cloned = Arc::clone(self);
let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
let _g = span.entered();
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
let timelines_dir = conf.timelines_path(&tenant_id);
for entry in
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
{
let entry = entry.context("read timeline dir entry")?;
let timeline_dir = entry.path();
if crate::is_temporary(&timeline_dir) {
info!(
"Found temporary timeline directory, removing: {}",
timeline_dir.display()
);
if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
error!(
"Failed to remove temporary directory '{}': {:?}",
timeline_dir.display(),
e
);
}
} else if is_uninit_mark(&timeline_dir) {
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file =
conf.timeline_uninit_mark_file_path(tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!(
%timeline_id,
"Found an uninit mark file, removing the timeline and its uninit mark",
);
if let Err(e) = remove_timeline_and_uninit_mark(
&timeline_dir,
&timeline_uninit_mark_file,
) {
error!("Failed to clean up uninit marked timeline: {e:?}");
}
continue;
}
let file_name = entry.file_name();
if let Ok(timeline_id) =
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
{
let metadata = load_metadata(conf, timeline_id, tenant_id)
.context("failed to load metadata")?;
timelines_to_load.insert(timeline_id, metadata);
} else {
// A file or directory that doesn't look like a timeline ID
warn!(
"unexpected file or directory in timelines directory: {}",
file_name.to_string_lossy()
);
}
}
}
// Sort the array of timeline IDs into tree-order, so that parent comes before
// all its children.
tree_sort_timelines(timelines_to_load)
cloned.scan_and_sort_timelines_dir()
})
.await
.context("load spawn_blocking")
@@ -1213,19 +1261,21 @@ impl Tenant {
&self,
timeline_id: TimelineId,
active_only: bool,
) -> anyhow::Result<Arc<Timeline>> {
) -> Result<Arc<Timeline>, GetTimelineError> {
let timelines_accessor = self.timelines.lock().unwrap();
let timeline = timelines_accessor.get(&timeline_id).with_context(|| {
format!("Timeline {}/{} was not found", self.tenant_id, timeline_id)
})?;
let timeline = timelines_accessor
.get(&timeline_id)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_id,
timeline_id,
})?;
if active_only && !timeline.is_active() {
anyhow::bail!(
"Timeline {}/{} is not active, state: {:?}",
self.tenant_id,
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_id,
timeline_id,
timeline.current_state()
)
state: timeline.current_state(),
})
} else {
Ok(Arc::clone(timeline))
}
@@ -1335,8 +1385,7 @@ impl Tenant {
/// Returns the new timeline ID and reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
/// a new unique ID is generated.
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
pub async fn create_timeline(
&self,
new_timeline_id: TimelineId,
@@ -1345,11 +1394,12 @@ impl Tenant {
pg_version: u32,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
) -> anyhow::Result<Option<Arc<Timeline>>> {
anyhow::ensure!(
self.is_active(),
"Cannot create timelines on inactive tenant"
);
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
debug!("timeline {new_timeline_id} already exists");
@@ -1369,7 +1419,7 @@ impl Tenant {
.context("wait for timeline uploads to complete")?;
}
return Ok(None);
return Err(CreateTimelineError::AlreadyExists);
}
let loaded_timeline = match ancestor_timeline_id {
@@ -1384,12 +1434,12 @@ impl Tenant {
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
// can we safely just branch from the ancestor instead?
bail!(
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
)));
}
// Wait for the WAL to arrive and be processed on the parent branch up
@@ -1423,7 +1473,7 @@ impl Tenant {
})?;
}
Ok(Some(loaded_timeline))
Ok(loaded_timeline)
}
/// perform one garbage collection iteration, removing old data files from disk.
@@ -1721,14 +1771,11 @@ impl Tenant {
timeline = Arc::clone(timeline_entry.get());
// Prevent two tasks from trying to delete the timeline at the same time.
delete_lock_guard =
DeletionGuard(Arc::clone(&timeline.delete_lock).try_lock_owned().map_err(
|_| {
DeleteTimelineError::Other(anyhow::anyhow!(
"timeline deletion is already in progress"
))
},
)?);
delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_lock)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);
// If another task finished the deletion just before we acquired the lock,
// return success.
@@ -2670,7 +2717,7 @@ impl Tenant {
dst_id: TimelineId,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
) -> Result<Arc<Timeline>, CreateTimelineError> {
let tl = self
.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.await?;
@@ -2687,7 +2734,7 @@ impl Tenant {
dst_id: TimelineId,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
) -> Result<Arc<Timeline>, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.await
}
@@ -2698,7 +2745,7 @@ impl Tenant {
dst_id: TimelineId,
start_lsn: Option<Lsn>,
_ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
) -> Result<Arc<Timeline>, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2738,16 +2785,17 @@ impl Tenant {
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
))?;
))
.map_err(CreateTimelineError::AncestorLsn)?;
// and then the planned GC cutoff
{
let gc_info = src_timeline.gc_info.read().unwrap();
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
if start_lsn < cutoff {
bail!(format!(
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
));
)));
}
}
@@ -3375,9 +3423,8 @@ where
#[cfg(test)]
pub mod harness {
use bytes::{Bytes, BytesMut};
use once_cell::sync::Lazy;
use once_cell::sync::OnceCell;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::sync::Arc;
use std::{fs, path::PathBuf};
use utils::logging;
use utils::lsn::Lsn;
@@ -3410,8 +3457,6 @@ pub mod harness {
buf.freeze()
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
@@ -3438,33 +3483,16 @@ pub mod harness {
}
}
pub struct TenantHarness<'a> {
pub struct TenantHarness {
pub conf: &'static PageServerConf,
pub tenant_conf: TenantConf,
pub tenant_id: TenantId,
pub lock_guard: (
Option<RwLockReadGuard<'a, ()>>,
Option<RwLockWriteGuard<'a, ()>>,
),
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
impl<'a> TenantHarness<'a> {
impl TenantHarness {
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
Self::create_internal(test_name, false)
}
pub fn create_exclusive(test_name: &'static str) -> anyhow::Result<Self> {
Self::create_internal(test_name, true)
}
fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result<Self> {
let lock_guard = if exclusive {
(None, Some(LOCK.write().unwrap()))
} else {
(Some(LOCK.read().unwrap()), None)
};
LOG_HANDLE.get_or_init(|| {
logging::init(
logging::LogFormat::Test,
@@ -3500,7 +3528,6 @@ pub mod harness {
conf,
tenant_conf,
tenant_id,
lock_guard,
})
}
@@ -3525,26 +3552,12 @@ pub mod harness {
self.tenant_id,
None,
));
// populate tenant with locally available timelines
let mut timelines_to_load = HashMap::new();
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
.expect("should be able to read timelines dir")
{
let timeline_dir_entry = timeline_dir_entry?;
let timeline_id: TimelineId = timeline_dir_entry
.path()
.file_name()
.unwrap()
.to_string_lossy()
.parse()?;
let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?;
timelines_to_load.insert(timeline_id, timeline_metadata);
}
tenant
.load(None, ctx)
.instrument(info_span!("try_load", tenant_id=%self.tenant_id))
.await?;
// TODO reuse Tenant::activate (needs broker)
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
timeline.set_state(TimelineState::Active);
@@ -3815,6 +3828,9 @@ mod tests {
{
Ok(_) => panic!("branching should have failed"),
Err(err) => {
let CreateTimelineError::AncestorLsn(err) = err else {
panic!("wrong error type")
};
assert!(err.to_string().contains("invalid branch start lsn"));
assert!(err
.source()
@@ -3844,6 +3860,9 @@ mod tests {
{
Ok(_) => panic!("branching should have failed"),
Err(err) => {
let CreateTimelineError::AncestorLsn(err) = err else {
panic!("wrong error type");
};
assert!(&err.to_string().contains("invalid branch start lsn"));
assert!(&err
.source()
@@ -4070,9 +4089,13 @@ mod tests {
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness.try_load(&ctx).await.err().expect("should fail");
assert!(err
.to_string()
.starts_with("Failed to parse metadata bytes from path"));
// get the whole chain with all .context, not only the last one
let message = format!("{err:#}");
let expected = "Failed to parse metadata bytes from path";
assert!(
message.contains(expected),
"message '{message}' expected to contain {expected}"
);
let mut found_error_message = false;
let mut err_source = err.source();
@@ -4506,6 +4529,44 @@ mod tests {
assert!(expect_initdb_optimization);
assert!(initdb_optimization_count > 0);
}
Ok(())
}
#[tokio::test]
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
let name = "test_uninit_mark_crash";
let harness = TenantHarness::create(name)?;
{
let (tenant, ctx) = harness.load().await;
let tline =
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
// Keeps uninit mark in place
std::mem::forget(tline);
}
let (tenant, _) = harness.load().await;
match tenant.get_timeline(TIMELINE_ID, false) {
Ok(_) => panic!("timeline should've been removed during load"),
Err(e) => {
assert_eq!(
e,
GetTimelineError::NotFound {
tenant_id: tenant.tenant_id,
timeline_id: TIMELINE_ID,
}
)
}
}
assert!(!harness
.conf
.timeline_path(&TIMELINE_ID, &tenant.tenant_id)
.exists());
assert!(!harness
.conf
.timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID)
.exists());
Ok(())
}

View File

@@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Context;
use anyhow::Result;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::Replacement;
pub use historic_layer_coverage::LayerKey;
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::PersistentLayerKey;
///
/// LayerMap tracks what layers exist on a timeline.
///
pub struct LayerMap<L: ?Sized> {
#[derive(Default)]
pub struct LayerMap {
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -95,24 +93,6 @@ pub struct LayerMap<L: ?Sized> {
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
/// RemoteLayer will be removed.
mapping: HashMap<PersistentLayerKey, Arc<L>>,
}
impl<L: ?Sized> Default for LayerMap<L> {
fn default() -> Self {
Self {
open_layer: None,
next_open_layer_at: None,
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
historic: BufferedHistoricLayerCoverage::default(),
mapping: HashMap::default(),
}
}
}
/// The primary update API for the layer map.
@@ -120,24 +100,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
/// Batching historic layer insertions and removals is good for
/// performance and this struct helps us do that correctly.
#[must_use]
pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
pub struct BatchedUpdates<'a> {
// While we hold this exclusive reference to the layer map the type checker
// will prevent us from accidentally reading any unflushed updates.
layer_map: &'a mut LayerMap<L>,
layer_map: &'a mut LayerMap,
}
/// Provide ability to batch more updates while hiding the read
/// API so we don't accidentally read without flushing.
impl<L> BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
impl BatchedUpdates<'_> {
///
/// Insert an on-disk layer.
///
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer_desc, layer)
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.insert_historic_noflush(layer_desc)
}
///
@@ -145,31 +122,8 @@ where
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer_desc, layer)
}
/// Replaces existing layer iff it is the `expected`.
///
/// If the expected layer has been removed it will not be inserted by this function.
///
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
/// be done.
///
/// TODO replacement can be done without buffering and rebuilding layer map updates.
/// One way to do that is to add a layer of indirection for returned values, so
/// that we can replace values only by updating a hashmap.
pub fn replace_historic(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
self.layer_map
.replace_historic_noflush(expected_desc, expected, new_desc, new)
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.remove_historic_noflush(layer_desc)
}
// We will flush on drop anyway, but this method makes it
@@ -185,25 +139,19 @@ where
// than panic later or read without flushing.
//
// TODO maybe warn if flush hasn't explicitly been called
impl<L> Drop for BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
impl Drop for BatchedUpdates<'_> {
fn drop(&mut self) {
self.layer_map.flush_updates();
}
}
/// Return value of LayerMap::search
pub struct SearchResult<L: ?Sized> {
pub layer: Arc<L>,
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub lsn_floor: Lsn,
}
impl<L> LayerMap<L>
where
L: ?Sized + Layer,
{
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
/// 'key', with lsn.start < 'end_lsn'.
@@ -235,7 +183,7 @@ where
/// NOTE: This only searches the 'historic' layers, *not* the
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
let latest_delta = version.delta_coverage.query(key.to_i128());
let latest_image = version.image_coverage.query(key.to_i128());
@@ -244,7 +192,6 @@ where
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor,
@@ -252,7 +199,6 @@ where
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -263,7 +209,6 @@ where
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
let image_exact_match = img_lsn + 1 == end_lsn;
if image_is_newer || image_exact_match {
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor: img_lsn,
@@ -271,7 +216,6 @@ where
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -282,7 +226,7 @@ where
}
/// Start a batch of updates, applied on drop
pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
BatchedUpdates { layer_map: self }
}
@@ -292,48 +236,32 @@ where
/// Helper function for BatchedUpdates::insert_historic
///
/// TODO(chi): remove L generic so that we do not need to pass layer object.
pub(self) fn insert_historic_noflush(
&mut self,
layer_desc: PersistentLayerDesc,
layer: Arc<L>,
) {
self.mapping.insert(layer_desc.key(), layer.clone());
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer) {
if Self::is_l0(&layer_desc) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
self.historic.insert(
historic_layer_coverage::LayerKey::from(&*layer),
historic_layer_coverage::LayerKey::from(&layer_desc),
layer_desc.into(),
);
}
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
let layer = self
.mapping
.get(key)
.with_context(|| format!("{key:?}"))
.expect("inconsistent layer mapping");
layer
}
///
/// Remove an on-disk layer from the map.
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&*layer));
if Self::is_l0(&layer) {
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| {
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
});
l0_delta_layers.retain(|other| other.key() != layer_key);
self.l0_delta_layers = l0_delta_layers;
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
@@ -344,69 +272,6 @@ where
"failed to locate removed historic layer from l0_delta_layers"
);
}
self.mapping.remove(&layer_desc.key());
}
pub(self) fn replace_historic_noflush(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
let key = historic_layer_coverage::LayerKey::from(&**expected);
let other = historic_layer_coverage::LayerKey::from(&*new);
let expected_l0 = Self::is_l0(expected);
let new_l0 = Self::is_l0(&new);
anyhow::ensure!(
key == other,
"expected and new must have equal LayerKeys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
);
let l0_index = if expected_l0 {
// find the index in case replace worked, we need to replace that as well
let pos = self.l0_delta_layers.iter().position(|slot| {
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
});
if pos.is_none() {
return Ok(Replacement::NotFound);
}
pos
} else {
None
};
let new_desc = Arc::new(new_desc);
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
**existing == expected_desc
});
if let Replacement::Replaced { .. } = &replaced {
self.mapping.remove(&expected_desc.key());
self.mapping.insert(new_desc.key(), new);
if let Some(index) = l0_index {
self.l0_delta_layers[index] = new_desc;
}
}
let replaced = match replaced {
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
Replacement::NotFound => Replacement::NotFound,
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
Replacement::Unexpected(x) => {
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
}
};
Ok(replaced)
}
/// Helper function for BatchedUpdates::drop.
@@ -454,10 +319,8 @@ where
Ok(true)
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.historic
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
self.historic.iter()
}
///
@@ -472,7 +335,7 @@ where
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
let version = match self.historic.get().unwrap().get_version(lsn.0) {
Some(v) => v,
None => return Ok(vec![]),
@@ -482,36 +345,26 @@ where
let end = key_range.end.to_i128();
// Initialize loop variables
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
let mut current_key = start;
let mut current_val = version.image_coverage.query(start);
// Loop through the change events and push intervals
for (change_key, change_val) in version.image_coverage.range(start..end) {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
current_key = change_key;
current_val = change_val.clone();
}
// Add the final interval
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
coverage.push((kr, current_val.take()));
Ok(coverage)
}
pub fn is_l0(layer: &L) -> bool {
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
}
@@ -537,7 +390,7 @@ where
/// TODO The optimal number should probably be slightly higher than 1, but to
/// implement that we need to plumb a lot more context into this function
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(layer) {
return true;
@@ -595,9 +448,7 @@ where
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
@@ -620,9 +471,7 @@ where
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
@@ -772,12 +621,8 @@ where
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self
.l0_delta_layers
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
.collect())
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
Ok(self.l0_delta_layers.to_vec())
}
/// debugging function to print out the contents of the layer map
@@ -802,72 +647,51 @@ where
println!("End dump LayerMap");
Ok(())
}
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
#[inline(always)]
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
// "dyn Trait" objects are "fat pointers" in that they have two components:
// - pointer to the object
// - pointer to the vtable
//
// rust does not provide a guarantee that these vtables are unique, but however
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
// pointer and the vtable need to be equal.
//
// See: https://github.com/rust-lang/rust/issues/103763
//
// A future version of rust will most likely use this form below, where we cast each
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
// not affect the comparison.
//
// See: https://github.com/rust-lang/rust/pull/106450
let left = Arc::as_ptr(left) as *const ();
let right = Arc::as_ptr(right) as *const ();
left == right
}
}
#[cfg(test)]
mod tests {
use super::{LayerMap, Replacement};
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use super::LayerMap;
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::{
storage_layer::{PersistentLayer, PersistentLayerDesc},
timeline::LayerFileManager,
};
use super::*;
#[test]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
}
#[test]
@@ -883,16 +707,16 @@ mod tests {
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
let mut map = LayerMap::default();
// after the immutable storage state refactor, the replace operation
// will not use the layer map any more. We keep it here for consistency in test cases
// and can remove it in the future.
let _map = LayerMap::default();
let res = map.batch_update().replace_historic(
not_found.get_persistent_layer_desc(),
&not_found,
new_version.get_persistent_layer_desc(),
new_version,
);
let mut mapping = LayerFileManager::new();
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
mapping
.replace_and_verify(not_found, new_version)
.unwrap_err();
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
@@ -903,49 +727,44 @@ mod tests {
let downloaded = Arc::new(skeleton);
let mut map = LayerMap::default();
let mut mapping = LayerFileManager::new();
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
let replaced = map
.batch_update()
.replace_historic(
remote.get_persistent_layer_desc(),
&remote,
downloaded.get_persistent_layer_desc(),
downloaded.clone(),
)
.expect("name derived attributes are the same");
assert!(
matches!(replaced, Replacement::Replaced { .. }),
"{replaced:?}"
.insert_historic(remote.layer_desc().clone());
mapping.insert(remote.clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
mapping
.replace_and_verify(remote, downloaded.clone())
.expect("name derived attributes are the same");
assert_eq!(
count_layer_in(&map, downloaded.layer_desc()),
expected_in_counts
);
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
map.batch_update()
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
.remove_historic(downloaded.layer_desc().clone());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.filter(|x| x.key() == layer.key())
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s
.iter()
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.count();
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
(historic, l0)
}

View File

@@ -3,6 +3,8 @@ use std::ops::Range;
use tracing::info;
use crate::tenant::storage_layer::PersistentLayerDesc;
use super::layer_coverage::LayerCoverageTuple;
/// Layers in this module are identified and indexed by this data.
@@ -41,8 +43,8 @@ impl Ord for LayerKey {
}
}
impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
fn from(layer: &'a L) -> Self {
impl From<&PersistentLayerDesc> for LayerKey {
fn from(layer: &PersistentLayerDesc) -> Self {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
LayerKey {
@@ -454,59 +456,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
self.buffer.insert(layer_key, None);
}
/// Replaces a previous layer with a new layer value.
///
/// The replacement is conditional on:
/// - there is an existing `LayerKey` record
/// - there is no buffered removal for the given `LayerKey`
/// - the given closure returns true for the current `Value`
///
/// The closure is used to compare the latest value (buffered insert, or existing layer)
/// against some expectation. This allows using `Arc::ptr_eq` or similar which would be
/// inaccessible via `PartialEq` trait.
///
/// Returns a `Replacement` value describing the outcome; only the case of
/// `Replacement::Replaced` modifies the map and requires a rebuild.
pub fn replace<F>(
&mut self,
layer_key: &LayerKey,
new: Value,
check_expected: F,
) -> Replacement<Value>
where
F: FnOnce(&Value) -> bool,
{
let (slot, in_buffered) = match self.buffer.get(layer_key) {
Some(inner @ Some(_)) => {
// we compare against the buffered version, because there will be a later
// rebuild before querying
(inner.as_ref(), true)
}
Some(None) => {
// buffer has removal for this key; it will not be equivalent by any check_expected.
return Replacement::RemovalBuffered;
}
None => {
// no pending modification for the key, check layers
(self.layers.get(layer_key), false)
}
};
match slot {
Some(existing) if !check_expected(existing) => {
// unfortunate clone here, but otherwise the nll borrowck grows the region of
// 'a to cover the whole function, and we could not mutate in the other
// Some(existing) branch
Replacement::Unexpected(existing.clone())
}
None => Replacement::NotFound,
Some(_existing) => {
self.insert(layer_key.to_owned(), new);
Replacement::Replaced { in_buffered }
}
}
}
pub fn rebuild(&mut self) {
// Find the first LSN that needs to be rebuilt
let rebuild_since: u64 = match self.buffer.iter().next() {
@@ -575,22 +524,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
}
}
/// Outcome of the replace operation.
#[derive(Debug)]
pub enum Replacement<Value> {
/// Previous value was replaced with the new value.
Replaced {
/// Replacement happened for a scheduled insert.
in_buffered: bool,
},
/// Key was not found in buffered updates or existing layers.
NotFound,
/// Key has been scheduled for removal; it was not replaced.
RemovalBuffered,
/// Previous value was rejected by the closure.
Unexpected(Value),
}
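// For orientation, a minimal usage sketch of the replace API documented
// above (illustrative only; it reuses the fixtures of the tests below):
// the closure vets the current value, and only `Replacement::Replaced`
// mutates the map and requires a rebuild.
#[test]
fn replace_usage_sketch() {
    let mut map = BufferedHistoricLayerCoverage::new();
    let key = LayerKey {
        key: 0..5,
        lsn: 100..101,
        is_image: true,
    };
    map.insert(key.clone(), "Image 1");
    let ret = map.replace(&key, "Remote Image 1", |&cur| cur == "Image 1");
    // The insert is still buffered at this point, hence in_buffered: true.
    assert!(matches!(ret, Replacement::Replaced { in_buffered: true }));
    map.rebuild();
}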
#[test]
fn test_retroactive_regression_1() {
let mut map = BufferedHistoricLayerCoverage::new();
@@ -699,139 +632,3 @@ fn test_retroactive_simple() {
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
}
}
#[test]
fn test_retroactive_replacement() {
let mut map = BufferedHistoricLayerCoverage::new();
let keys = [
LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
},
LayerKey {
key: 3..9,
lsn: 110..111,
is_image: true,
},
LayerKey {
key: 4..6,
lsn: 120..121,
is_image: true,
},
];
let layers = [
"Image 1".to_string(),
"Image 2".to_string(),
"Image 3".to_string(),
];
for (key, layer) in keys.iter().zip(layers.iter()) {
map.insert(key.to_owned(), layer.to_owned());
}
// rebuild is not necessary here, because replace works for both buffered updates and existing
// layers.
for (key, orig_layer) in keys.iter().zip(layers.iter()) {
let replacement = format!("Remote {orig_layer}");
// evict
let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
assert!(
matches!(ret, Replacement::Replaced { .. }),
"replace {orig_layer}: {ret:?}"
);
map.rebuild();
let at = key.lsn.end + 1;
let version = map.get().expect("rebuilt").get_version(at).unwrap();
assert_eq!(
version.image_coverage.query(4).as_deref(),
Some(replacement.as_str()),
"query for 4 at version {at} after eviction",
);
// download
let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
assert!(
matches!(ret, Replacement::Replaced { .. }),
"replace {orig_layer} back: {ret:?}"
);
map.rebuild();
let version = map.get().expect("rebuilt").get_version(at).unwrap();
assert_eq!(
version.image_coverage.query(4).as_deref(),
Some(orig_layer.as_str()),
"query for 4 at version {at} after download",
);
}
}
#[test]
fn missing_key_is_not_inserted_with_replace() {
let mut map = BufferedHistoricLayerCoverage::new();
let key = LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
};
let ret = map.replace(&key, "should not replace", |_| true);
assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
map.rebuild();
assert!(map
.get()
.expect("no changes to rebuild")
.get_version(102)
.is_none());
}
#[test]
fn replacing_buffered_insert_and_remove() {
let mut map = BufferedHistoricLayerCoverage::new();
let key = LayerKey {
key: 0..5,
lsn: 100..101,
is_image: true,
};
map.insert(key.clone(), "Image 1");
let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
assert!(
matches!(ret, Replacement::Replaced { in_buffered: true }),
"{ret:?}"
);
map.rebuild();
assert_eq!(
map.get()
.expect("rebuilt")
.get_version(102)
.unwrap()
.image_coverage
.query(4),
Some("Remote Image 1")
);
map.remove(key.clone());
let ret = map.replace(&key, "should not replace", |_| true);
assert!(
matches!(ret, Replacement::RemovalBuffered),
"cannot replace after scheduled remove: {ret:?}"
);
map.rebuild();
let ret = map.replace(&key, "should not replace", |_| true);
assert!(
matches!(ret, Replacement::NotFound),
"cannot replace after remove + rebuild: {ret:?}"
);
let at_version = map.get().expect("rebuilt").get_version(102);
assert!(at_version.is_none());
}

View File

@@ -675,7 +675,7 @@ pub async fn immediate_gc(
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
@@ -724,11 +724,11 @@ pub async fn immediate_compact(
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);

View File

@@ -608,10 +608,7 @@ impl RemoteTimelineClient {
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!(
"scheduled layer file upload {}",
layer_file_name.file_name()
);
info!("scheduled layer file upload {layer_file_name}");
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -664,7 +661,7 @@ impl RemoteTimelineClient {
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
@@ -828,7 +825,7 @@ impl RemoteTimelineClient {
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
@@ -862,10 +859,8 @@ impl RemoteTimelineClient {
"Found {} files not bound to index_file.json, proceeding with their deletion",
remaining.len()
);
for file in remaining {
warn!("Removing {}", file.object_name().unwrap_or_default());
self.storage_impl.delete(&file).await?;
}
warn!("About to remove {} files", remaining.len());
self.storage_impl.delete_objects(&remaining).await?;
}
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
@@ -1367,7 +1362,7 @@ mod tests {
struct TestSetup {
runtime: &'static tokio::runtime::Runtime,
entered_runtime: EnterGuard<'static>,
harness: TenantHarness<'static>,
harness: TenantHarness,
tenant: Arc<Tenant>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,

View File

@@ -176,13 +176,10 @@ impl LayerAccessStats {
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn for_loading_layer<L>(
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &BatchedUpdates<'_>,
status: LayerResidenceStatus,
) -> Self
where
L: ?Sized + Layer,
{
) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(
layer_map_lock_held_witness,
@@ -197,14 +194,11 @@ impl LayerAccessStats {
/// The `new_status` is not recorded in `self`.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn clone_for_residence_change<L>(
pub(crate) fn clone_for_residence_change(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
new_status: LayerResidenceStatus,
) -> LayerAccessStats
where
L: ?Sized + Layer,
{
) -> LayerAccessStats {
let clone = {
let inner = self.0.lock().unwrap();
inner.clone()
@@ -232,14 +226,12 @@ impl LayerAccessStats {
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
///
pub(crate) fn record_residence_event<L>(
pub(crate) fn record_residence_event(
&self,
_layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
_layer_map_lock_held_witness: &BatchedUpdates<'_>,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) where
L: ?Sized + Layer,
{
) {
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner
@@ -343,7 +335,7 @@ impl LayerAccessStats {
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known from the context in which the
/// layers are used (the timeline).
pub trait Layer: std::fmt::Debug + Send + Sync {
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
@@ -381,9 +373,6 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
ctx: &RequestContext,
) -> Result<ValueReconstructResult>;
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
fn short_id(&self) -> String;
/// Dump summary of the contents of the layer to stdout
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
}
@@ -473,94 +462,127 @@ pub fn downcast_remote_layer(
}
}
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
pub key: Range<Key>,
pub lsn: Range<Lsn>,
pub is_incremental: bool,
pub short_id: String,
}
pub mod tests {
use super::*;
impl LayerDescriptor {
/// `LayerDescriptor` is only used for testing purposes, so it does not matter whether it is image / delta,
/// and the tenant / timeline id does not matter.
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
self.key.clone(),
self.lsn.clone(),
233,
)
}
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
base: PersistentLayerDesc,
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn.clone()
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn short_id(&self) -> String {
self.short_id.clone()
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
let short_id = value.to_string();
LayerDescriptor {
key: value.key_range,
lsn: value.lsn_range,
is_incremental: true,
short_id,
impl From<PersistentLayerDesc> for LayerDescriptor {
fn from(base: PersistentLayerDesc) -> Self {
Self { base }
}
}
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
let short_id = value.to_string();
let lsn = value.lsn_as_range();
LayerDescriptor {
key: value.key_range,
lsn,
is_incremental: false,
short_id,
impl Layer for LayerDescriptor {
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for LayerDescriptor {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl PersistentLayer for LayerDescriptor {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.base
}
fn local_path(&self) -> Option<PathBuf> {
unimplemented!()
}
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!()
}
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
unimplemented!()
}
fn delete_resident_layer_file(&self) -> Result<()> {
unimplemented!()
}
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
unimplemented!()
}
fn access_stats(&self) -> &LayerAccessStats {
unimplemented!()
}
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn_range,
233,
),
}
}
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_img(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
false,
233,
),
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}
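// Sketch of the fixture path described in the doc comment above: parse a
// file name into a LayerFileName, then convert it into a LayerDescriptor.
// (Assumes the FromStr impl that the layer_map tests rely on.)
#[test]
fn filename_fixture_sketch() {
    let name: LayerFileName =
        "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69"
            .parse()
            .unwrap();
    let _fixture = LayerDescriptor::from(name);
}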
}

View File

@@ -394,10 +394,11 @@ impl Layer for DeltaLayer {
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for DeltaLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}

View File

@@ -210,9 +210,15 @@ pub enum LayerFileName {
impl LayerFileName {
pub fn file_name(&self) -> String {
self.to_string()
}
}
impl fmt::Display for LayerFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Image(fname) => fname.to_string(),
Self::Delta(fname) => fname.to_string(),
Self::Image(fname) => write!(f, "{fname}"),
Self::Delta(fname) => write!(f, "{fname}"),
}
}
}
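// Sketch (assumes Rust 2021 inline format captures): with Display now
// required by the Layer trait, helpers can format any layer generically,
// mirroring the interpolation used by the info!/debug! call sites above.
fn layer_log_line<L: std::fmt::Display + ?Sized>(layer: &L) -> String {
    format!("scheduled layer file upload {layer}")
}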

View File

@@ -230,10 +230,12 @@ impl Layer for ImageLayer {
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for ImageLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}

View File

@@ -131,13 +131,6 @@ impl Layer for InMemoryLayer {
true
}
fn short_id(&self) -> String {
let inner = self.inner.read().unwrap();
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().unwrap();
@@ -240,6 +233,15 @@ impl Layer for InMemoryLayer {
}
}
impl std::fmt::Display for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let inner = self.inner.read().unwrap();
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
}
}
impl InMemoryLayer {
///
/// Get layer size on the disk

View File

@@ -1,4 +1,5 @@
use anyhow::Result;
use core::fmt::Display;
use std::ops::Range;
use utils::{
id::{TenantId, TimelineId},
@@ -48,8 +49,8 @@ impl PersistentLayerDesc {
}
}
pub fn short_id(&self) -> String {
self.filename().file_name()
pub fn short_id(&self) -> impl Display {
self.filename()
}
#[cfg(test)]

View File

@@ -71,10 +71,7 @@ impl Layer for RemoteLayer {
_reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
bail!(
"layer {} needs to be downloaded",
self.filename().file_name()
);
bail!("layer {self} needs to be downloaded");
}
/// debugging function to print out the contents of the layer
@@ -106,10 +103,12 @@ impl Layer for RemoteLayer {
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
@@ -218,15 +217,12 @@ impl RemoteLayer {
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub fn create_downloaded_layer<L>(
pub fn create_downloaded_layer(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer>
where
L: ?Sized + Layer,
{
) -> Arc<dyn PersistentLayer> {
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
Arc::new(DeltaLayer::new(

File diff suppressed because it is too large

View File

@@ -70,7 +70,6 @@ impl Timeline {
};
self_clone.eviction_task(cancel).await;
info!("eviction task finishing");
Ok(())
},
);
@@ -78,6 +77,9 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
scopeguard::defer! {
info!("eviction task finishing");
}
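// Note: this defer guard logs "eviction task finishing" on every exit path
// (cancellation during the init delay, loop break, normal return), which is
// why the per-path info!("shutting down") lines below are removed.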
use crate::tenant::tasks::random_init_delay;
{
let policy = self.get_eviction_policy();
@@ -86,7 +88,6 @@ impl Timeline {
EvictionPolicy::NoEviction => Duration::from_secs(10),
};
if random_init_delay(period, &cancel).await.is_err() {
info!("shutting down");
return;
}
}
@@ -101,7 +102,6 @@ impl Timeline {
ControlFlow::Continue(sleep_until) => {
tokio::select! {
_ = cancel.cancelled() => {
info!("shutting down");
break;
}
_ = tokio::time::sleep_until(sleep_until) => { }
@@ -197,9 +197,11 @@ impl Timeline {
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let layers = self.layers.read().await;
let guard = self.layers.read().await;
let (layers, mapping) = &*guard;
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = mapping.get_from_desc(&hist_layer);
if hist_layer.is_remote_layer() {
continue;
}
@@ -207,7 +209,7 @@ impl Timeline {
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
// We only use this fallback if there's an implementation error.
// `latest_activity` already does rate-limited warn!() log.
debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
SystemTime::now()
});

View File

@@ -1321,7 +1321,7 @@ mod tests {
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)

View File

@@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection(
ctx: RequestContext,
node: NodeId,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
WALRECEIVER_STARTED_CONNECTIONS.inc();
// Connect to the database in replication mode.
@@ -140,6 +142,9 @@ pub(super) async fn handle_walreceiver_connection(
}
Ok(())
}
// Enrich the log lines emitted by this closure with meaningful context.
// TODO: technically, this task outlives the surrounding function, so the
// spans won't be properly nested.
.instrument(tracing::info_span!("poller")),
);

View File

@@ -302,15 +302,6 @@ impl VirtualFile {
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
//
// TODO: We could downgrade the locks to read mode before calling
// 'func', to allow a little bit more concurrency, but the standard
// library RwLock doesn't allow downgrading without releasing the lock,
// and that doesn't seem worth the trouble.
//
// XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and
// may deadlock on subsequent read calls.
// Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly.
let result = STORAGE_IO_TIME
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| func(&file));

View File

@@ -122,6 +122,43 @@ hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
true, true, hnsw_build_callback, (void *) hnsw, NULL);
}
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
static void
hnsw_check_available_memory(Size requested)
{
size_t total;
size_t len = sizeof(total);
if (sysctlbyname("hw.memsize", &total, &len, NULL, 0) < 0)
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#else
#include <sys/sysinfo.h>
static void
hnsw_check_available_memory(Size requested)
{
struct sysinfo si;
Size total;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %n");
total = si.totalram*si.mem_unit;
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#endif
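/*
 * Illustrative numbers only: with shared_buffers = 128MB (NBuffers * BLCKSZ)
 * on a machine with 16GB of RAM, an index whose shared memory segment needs
 * more than roughly 16GB - 128MB is rejected here, up front, instead of
 * failing later when the DSM segment is created.
 */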
static HierarchicalNSW*
hnsw_get_index(Relation indexRel, Relation heapRel)
{
@@ -156,6 +193,8 @@ hnsw_get_index(Relation indexRel, Relation heapRel)
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
hnsw_check_available_memory(shmem_size);
/* first try to attach to an existing index */
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, DEBUG1))

View File

@@ -1,4 +1,4 @@
comment = 'hNsw index'
comment = 'hnsw index'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true

View File

@@ -8,7 +8,6 @@ OBJS = \
libpagestore.o \
libpqwalproposer.o \
neon.o \
access_stat.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \

View File

@@ -1,274 +0,0 @@
/*
* We want this statistic to represent the current access pattern. This is why, when
* (n_seq_accesses + n_rnd_accesses) > MAX_ACCESS_COUNTER, we divide both counters by two,
* decreasing the weight of historical data.
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "pagestore_client.h"
#include "storage/relfilenode.h"
#include "utils/guc.h"
/* Structure used to predict sequential access */
typedef struct AccessStatEntry {
RelFileNode relnode;
BlockNumber blkno; /* last accessed block number */
uint32 n_seq_accesses; /* number of sequential accesses (when block N+1 is accessed after block N) */
uint32 n_rnd_accesses; /* number of random accesses */
uint32 hash;
uint32 status;
uint64 access_count; /* total number of relation accesses since backend start */
dlist_node lru_node; /* LRU list node */
} AccessStatEntry;
#define SH_PREFIX as
#define SH_ELEMENT_TYPE AccessStatEntry
#define SH_KEY_TYPE RelFileNode
#define SH_KEY relnode
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->hash)
#define SH_HASH_KEY(tb, key) hash_bytes( \
((const unsigned char *) &(key)), \
sizeof(RelFileNode) \
)
#define SH_EQUAL(tb, a, b) RelFileNodeEquals((a), (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
static as_hash *hash;
static dlist_head lru;
static int max_access_stat_size;
static int max_access_stat_count;
static double min_seq_access_ratio;
static int min_seq_access_count;
void access_stat_init(void)
{
MemoryContext memctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/access_stat",
ALLOCSET_DEFAULT_SIZES);
DefineCustomIntVariable("neon.max_access_stat_size",
"Maximal size of Neon relation access statistic hash",
NULL,
&max_access_stat_size,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.max_access_stat_count",
"Maximal value of relation access counter after which counters are divided by 2",
NULL,
&max_access_stat_count,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomRealVariable("neon.min_seq_access_ratio",
"Minimal seq/(rnd+seq) ratio to determine sequential access",
NULL,
&min_seq_access_ratio,
0.9,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.min_seq_access_count",
"Minimal access count to determine sequetial access",
NULL,
&min_seq_access_count,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
hash = as_create(memctx, max_access_stat_size, NULL);
dlist_init(&lru);
}
bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
{
bool is_seq_access = false;
if (forkNum == MAIN_FORKNUM /* prefetch makes sense only for main fork */
&& max_access_stat_size != 0)
{
AccessStatEntry* entry = as_lookup(hash, rnode);
if (entry == NULL)
{
bool found;
/* New item */
while (hash->members >= max_access_stat_size)
{
/* Hash overflow: find candidate for replacement */
AccessStatEntry* victim = dlist_container(AccessStatEntry, lru_node, dlist_pop_head_node(&lru));
as_delete_item(hash, victim);
}
entry = as_insert(hash, rnode, &found);
Assert(!found);
/* Set both counters to zero because we don't know whether the first access is sequential or random */
entry->n_seq_accesses = 0;
entry->n_rnd_accesses = 0;
entry->access_count = 1;
}
else
{
uint32 access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
/*
* We want this function to represent the most recent access pattern,
* so when the number of accesses exceeds the threshold value `max_access_stat_count`
* we divide both counters by two, devaluing old data
*/
if (access_count >= max_access_stat_count)
{
entry->n_seq_accesses >>= 1;
entry->n_rnd_accesses >>= 1;
}
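/*
 * Worked example: with the default max_access_stat_count = 1024, counters of
 * 900 sequential / 124 random accesses are halved to 450 / 62 before the next
 * access is recorded, so the ratio tracks roughly the last ~1024 accesses.
 */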
if (entry->blkno+1 == blkno)
entry->n_seq_accesses += 1;
else
entry->n_rnd_accesses += 1;
entry->access_count += 1;
access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
is_seq_access = access_count >= min_seq_access_count
&& (double)entry->n_seq_accesses / access_count >= min_seq_access_ratio;
/* Remove entry from the LRU list so that it can be reinserted at the tail */
dlist_delete(&entry->lru_node);
}
/* Place entry at the tail of the LRU list */
dlist_push_tail(&lru, &entry->lru_node);
entry->blkno = blkno;
}
return is_seq_access;
}
/*
* Get relation access pattern
*/
PG_FUNCTION_INFO_V1(get_relation_access_statistics);
typedef struct
{
TupleDesc tupdesc;
dlist_node* curr;
} AccessStatContext;
#define NUM_ACCESS_STAT_COLUMNS 6
Datum
get_relation_access_statistics(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
MemoryContext oldcontext;
AccessStatContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
if (SRF_IS_FIRSTCALL())
{
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (AccessStatContext *) palloc(sizeof(AccessStatContext));
/*
* To smoothly support upgrades from version 1.0 of this extension
* transparently handle the (non-)existence of the pinning_backends
* column. We unfortunately have to get the result type for that... -
* we can't use the result type determined by the function definition
* without potentially crashing when somebody uses the old (or even
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_ACCESS_STAT_COLUMNS)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "relfilenode",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "reltablespace",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reldatabase",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "seqaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "rndaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "accesscnt",
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
fctx->curr = dlist_is_empty(&lru) ? NULL : dlist_tail_node(&lru);
/* Set max calls and remember the user function context. */
funcctx->max_calls = hash->members;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = funcctx->user_fctx;
if (fctx->curr)
{
AccessStatEntry* entry = dlist_container(AccessStatEntry, lru_node, fctx->curr);
Datum values[NUM_ACCESS_STAT_COLUMNS];
bool nulls[NUM_ACCESS_STAT_COLUMNS] = {
false, false, false, false, false, false
};
values[0] = ObjectIdGetDatum(entry->relnode.relNode);
values[1] = ObjectIdGetDatum(entry->relnode.spcNode);
values[2] = ObjectIdGetDatum(entry->relnode.dbNode);
values[3] = Int32GetDatum(entry->n_seq_accesses);
values[4] = Int32GetDatum(entry->n_rnd_accesses);
values[5] = Int64GetDatum(entry->access_count);
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
result = HeapTupleGetDatum(tuple);
fctx->curr = dlist_has_prev(&lru, fctx->curr) ? dlist_prev_node(&lru, fctx->curr) : NULL;
SRF_RETURN_NEXT(funcctx, result);
}
else
SRF_RETURN_DONE(funcctx);
}

View File

@@ -32,6 +32,7 @@
#include "port.h"
#include <curl/curl.h>
#include "utils/jsonb.h"
#include "libpq/crypt.h"
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
@@ -161,7 +162,22 @@ ConstructDeltaMessage()
PushKeyValue(&state, "name", entry->name);
if (entry->password)
{
#if PG_MAJORVERSION_NUM == 14
char *logdetail;
#else
const char *logdetail;
#endif
PushKeyValue(&state, "password", (char *) entry->password);
char *encrypted_password = get_role_password(entry->name, &logdetail);
if (encrypted_password)
{
PushKeyValue(&state, "encrypted_password", encrypted_password);
}
else
{
elog(ERROR, "Failed to get encrypted password: %s", logdetail);
}
}
if (entry->old_name[0] != '\0')
{

View File

@@ -190,7 +190,7 @@ lfc_change_limit_hook(int newval, void *extra)
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
elog(LOG, "set local file cache limit to %d", new_size);
elog(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}

View File

@@ -58,7 +58,6 @@ char *neon_auth_token;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
int readahead_distance = 10;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -453,18 +452,6 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
DefineCustomIntVariable("neon.readahead_distance",
"Number of read-ahead blocks",
NULL,
&readahead_distance,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
relsize_hash_init();
@@ -489,5 +476,4 @@ pg_init_libpagestore(void)
redo_read_buffer_filter = neon_redo_read_buffer_filter;
}
lfc_init();
access_stat_init();
}

View File

@@ -27,18 +27,12 @@ RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'local_cache_pages'
LANGUAGE C PARALLEL SAFE;
CREATE FUNCTION get_relation_access_statistics()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'get_relation_access_statistics'
LANGUAGE C PARALLEL SAFE;
-- Create a view for convenient access.
CREATE VIEW local_cache AS
SELECT relname,P.* FROM local_cache_pages() AS P
SELECT P.* FROM local_cache_pages() AS P
(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
relforknumber int2, relblocknumber int8, accesscount int4) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
relforknumber int2, relblocknumber int8, accesscount int4);
CREATE VIEW relation_access_statistics AS
SELECT relname,P.* FROM get_relation_access_statistics() AS P
(relfilenode oid, reltablespace oid, reldatabase oid,
seqaccess int4, rndaccess int4, access_count int8) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
CREATE FUNCTION copy_from(conninfo cstring) RETURNS BIGINT
AS 'MODULE_PATHNAME', 'copy_from'
LANGUAGE C;

View File

@@ -13,20 +13,32 @@
#include "access/xact.h"
#include "access/xlog.h"
#include "access/relation.h"
#include "access/xloginsert.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "catalog/pg_type.h"
#include "catalog/namespace.h"
#include "replication/walsender.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "utils/pg_lsn.h"
#include "utils/guc.h"
#include "utils/wait_event.h"
#include "utils/rel.h"
#include "utils/varlena.h"
#include "utils/builtins.h"
#include "neon.h"
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "libpq-fe.h"
#include "libpq/pqformat.h"
#include "libpq/libpq.h"
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -46,6 +58,7 @@ _PG_init(void)
PG_FUNCTION_INFO_V1(pg_cluster_size);
PG_FUNCTION_INFO_V1(backpressure_lsns);
PG_FUNCTION_INFO_V1(backpressure_throttling_time);
PG_FUNCTION_INFO_V1(copy_from);
Datum
pg_cluster_size(PG_FUNCTION_ARGS)
@@ -91,3 +104,281 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
{
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#define N_RAW_PAGE_COLUMNS 4
#define COPY_FETCH_COUNT 16
static void
report_error(int elevel, PGresult *res, PGconn *conn,
bool clear, const char *sql)
{
/* If requested, PGresult must be released before leaving this function. */
PG_TRY();
{
char *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY);
char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL);
char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT);
char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT);
int sqlstate;
if (diag_sqlstate)
sqlstate = MAKE_SQLSTATE(diag_sqlstate[0],
diag_sqlstate[1],
diag_sqlstate[2],
diag_sqlstate[3],
diag_sqlstate[4]);
else
sqlstate = ERRCODE_CONNECTION_FAILURE;
/*
* If we don't get a message from the PGresult, try the PGconn. This
* is needed because for connection-level failures, PQexec may just
* return NULL, not a PGresult at all.
*/
if (message_primary == NULL)
message_primary = pchomp(PQerrorMessage(conn));
ereport(elevel,
(errcode(sqlstate),
(message_primary != NULL && message_primary[0] != '\0') ?
errmsg_internal("%s", message_primary) :
errmsg("could not obtain message string for remote error"),
message_detail ? errdetail_internal("%s", message_detail) : 0,
message_hint ? errhint("%s", message_hint) : 0,
message_context ? errcontext("%s", message_context) : 0,
sql ? errcontext("remote SQL command: %s", sql) : 0));
}
PG_FINALLY();
{
if (clear)
PQclear(res);
}
PG_END_TRY();
}
static PGresult *
get_result(PGconn *conn, const char *query)
{
PGresult *volatile last_res = NULL;
/* In what follows, do not leak any PGresults on an error. */
PG_TRY();
{
for (;;)
{
PGresult *res;
while (PQisBusy(conn))
{
int wc;
/* Sleep until there's something to do */
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(conn),
-1L, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (wc & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(conn))
report_error(ERROR, NULL, conn, false, query);
}
}
res = PQgetResult(conn);
if (res == NULL)
break; /* query is complete */
PQclear(last_res);
last_res = res;
}
}
PG_CATCH();
{
PQclear(last_res);
PG_RE_THROW();
}
PG_END_TRY();
return last_res;
}
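/*
 * Note: get_result() drains the connection without blocking the backend. It
 * waits on the process latch and the socket, so CHECK_FOR_INTERRUPTS (and
 * thus query cancellation) keeps working while the remote query runs.
 */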
#define CREATE_COPYDATA_FUNC "\
create or replace function copydata() returns setof record as $$ \
declare \
relsize integer; \
total_relsize integer; \
content bytea; \
r record; \
fork text; \
relname text; \
pagesize integer; \
begin \
pagesize = current_setting('block_size'); \
for r in select oid,reltoastrelid from pg_class where relnamespace not in (select oid from pg_namespace where nspname in ('pg_catalog','pg_toast','information_schema')) \
loop \
relname = r.oid::regclass::text; \
total_relsize = 0; \
foreach fork in array array['main','vm','fsm'] \
loop \
relsize = pg_relation_size(r.oid, fork)/pagesize; \
total_relsize = total_relsize + relsize; \
for p in 1..relsize \
loop \
content = get_raw_page(relname, fork, p-1); \
return next row(relname,fork,p-1,content); \
end loop; \
end loop; \
if total_relsize <> 0 and r.reltoastrelid <> 0 then \
foreach relname in array array ['pg_toast.pg_toast_'||r.oid, 'pg_toast.pg_toast_'||r.oid||'_index'] \
loop \
foreach fork in array array['main','vm','fsm'] \
loop \
relsize = pg_relation_size(relname, fork)/pagesize; \
for p in 1..relsize \
loop \
content = get_raw_page(relname, fork, p-1); \
return next row(relname,fork,p-1,content); \
end loop; \
end loop; \
end loop; \
end if; \
end loop; \
end; \
$$ language plpgsql"
Datum
copy_from(PG_FUNCTION_ARGS)
{
char const* conninfo = PG_GETARG_CSTRING(0);
PGconn* conn;
char const* declare_cursor = "declare copy_data_cursor no scroll cursor for select * from copydata() as raw_page(relid text, fork text, blkno integer, content bytea)";
char* fetch_cursor = psprintf("fetch forward %d copy_data_cursor", COPY_FETCH_COUNT);
char const* close_cursor = "close copy_data_cursor";
char const* vacuum_freeze = "vacuum freeze";
char *content;
char const* relname;
BlockNumber blkno;
ForkNumber forknum;
BlockNumber prev_blkno = InvalidBlockNumber;
RangeVar *relrv;
Relation rel = NULL;
BlockNumber rel_size;
int64_t total = 0;
PGresult *res;
char blkno_buf[4];
int n_tuples;
Buffer buf;
char* toast_rel_name;
Oid relid = InvalidOid;
/* Connect to the source database */
conn = PQconnectdb(conninfo);
if (!conn || PQstatus(conn) != CONNECTION_OK)
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg("could not connect to server \"%s\"",
conninfo),
errdetail_internal("%s", pchomp(PQerrorMessage(conn)))));
/* First create the stored procedure (assumes that the pageinspect extension is already installed) */
res = PQexec(conn, CREATE_COPYDATA_FUNC);
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, CREATE_COPYDATA_FUNC);
PQclear(res);
/* Freeze all tables to prevent problems with XID mapping */
res = PQexec(conn, vacuum_freeze);
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, vacuum_freeze);
PQclear(res);
/* Start transaction to use cursor */
res = PQexec(conn, "BEGIN");
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, "BEGIN");
PQclear(res);
/* Declare cursor (we have to use a cursor to avoid materializing the whole database in memory) */
res = PQexec(conn, declare_cursor);
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, declare_cursor);
PQclear(res);
/* Get database data */
while ((res = PQexecParams(conn, fetch_cursor, 0, NULL, NULL, NULL, NULL, 1)) != NULL)
{
if (PQresultStatus(res) != PGRES_TUPLES_OK)
report_error(ERROR, res, conn, true, fetch_cursor);
n_tuples = PQntuples(res);
if (PQnfields(res) != 4)
elog(ERROR, "unexpected result from copydata()");
for (int i = 0; i < n_tuples; i++)
{
relname = PQgetvalue(res, i, 0);
forknum = forkname_to_number(PQgetvalue(res, i, 1));
memcpy(&blkno, PQgetvalue(res, i, 2), sizeof(BlockNumber));
blkno = pg_ntoh32(blkno);
content = (char*)PQgetvalue(res, i, 3);
if (blkno <= prev_blkno)
{
if (forknum == MAIN_FORKNUM)
{
char* dst_rel_name = strncmp(relname, "pg_toast.", 9) == 0
/* Construct correct TOAST table name */
? psprintf("pg_toast.pg_toast_%u%s",
relid,
strcmp(relname + strlen(relname) - 5, "index") == 0 ? "_index" : "")
: (char*)relname;
if (rel)
relation_close(rel, AccessExclusiveLock);
relrv = makeRangeVarFromNameList(textToQualifiedNameList(cstring_to_text(dst_rel_name)));
rel = relation_openrv(relrv, AccessExclusiveLock);
if (dst_rel_name != relname)
pfree(dst_rel_name);
else
relid = RelationGetRelid(rel);
}
rel_size = RelationGetNumberOfBlocksInFork(rel, forknum);
}
buf = ReadBufferExtended(rel, forknum, blkno < rel_size ? blkno : P_NEW, RBM_ZERO_AND_LOCK, NULL);
MarkBufferDirty(buf);
memcpy(BufferGetPage(buf), content, BLCKSZ);
log_newpage_buffer(buf, forknum == MAIN_FORKNUM);
UnlockReleaseBuffer(buf);
total += 1;
prev_blkno = blkno;
}
PQclear(res);
if (n_tuples < COPY_FETCH_COUNT)
break;
}
res = PQexec(conn, close_cursor);
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, close_cursor);
PQclear(res);
if (rel)
relation_close(rel, AccessExclusiveLock);
/* Complete transaction */
res = PQexec(conn, "END");
if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
report_error(ERROR, res, conn, true, "END");
PQclear(res);
PQfinish(conn);
PG_RETURN_INT64(total);
}
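/*
 * Usage sketch, via the SQL wrapper declared above:
 *   SELECT copy_from('host=source.example dbname=db');
 * It returns the number of pages copied. The source database must already
 * have the pageinspect extension installed (copydata() calls get_raw_page).
 */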

View File

@@ -157,7 +157,6 @@ extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern int readahead_distance;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
@@ -211,8 +210,5 @@ extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumbe
extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
extern void lfc_init(void);
/* Access statistic */
extern void access_stat_init(void);
extern bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
#endif

View File

@@ -1881,7 +1881,6 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
/*
* Try to find prefetched page in the list of received pages.
*/
@@ -2004,10 +2003,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* If sequential access is expected, initiate prefetch of the next block */
if (is_sequential_access(reln->smgr_rnode.node, forkNum, blkno))
neon_prefetch(reln, forkNum, blkno + readahead_distance);
/* Try to read from local file cache */
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
{
@@ -2680,7 +2675,6 @@ bool
neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
{
XLogRecPtr end_recptr = record->EndRecPtr;
XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1;
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
@@ -2724,16 +2718,15 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
no_redo_needed = buffer < 0;
/* we don't have the buffer in memory, update lwLsn past this record */
/* In both cases set lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
/* we don't have the buffer in memory, update lwLsn past this record,
* also evict the page from the file cache
*/
if (no_redo_needed)
{
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
lfc_evict(rnode, forknum, blkno);
}
else
{
SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
}
LWLockRelease(partitionLock);
@@ -2741,7 +2734,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
if (get_cached_relsize(rnode, forknum, &relsize))
{
if (relsize < blkno + 1)
{
update_cached_relsize(rnode, forknum, blkno + 1);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
}
}
else
{
@@ -2773,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
Assert(nbresponse->n_blocks > blkno);
set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
}

poetry.lock (generated)
View File

@@ -1654,71 +1654,74 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "psycopg2-binary"
version = "2.9.3"
version = "2.9.6"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"},
{file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"},
{file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"},
{file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"},
{file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"},
]
[[package]]

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
atty.workspace = true
base64.workspace = true
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
@@ -30,6 +29,7 @@ metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
pbkdf2.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true
@@ -38,6 +38,7 @@ rand.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
reqwest-middleware.workspace = true
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
routerify.workspace = true
rustls-pemfile.workspace = true

View File

@@ -136,18 +136,17 @@ impl Default for ConnCfg {
impl ConnCfg {
/// Establish a raw TCP connection to the compute node.
async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
use tokio_postgres::config::Host;
// wrap TcpStream::connect with timeout
let connect_with_timeout = |host, port| {
let connection_timeout = Duration::from_millis(10000);
tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
move |res| match res {
Ok(tcpstream_connect_res) => tcpstream_connect_res,
Err(_) => Err(io::Error::new(
io::ErrorKind::TimedOut,
format!("exceeded connection timeout {connection_timeout:?}"),
format!("exceeded connection timeout {timeout:?}"),
)),
},
)
@@ -223,8 +222,9 @@ impl ConnCfg {
async fn do_connect(
&self,
allow_self_signed_compute: bool,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw().await?;
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
@@ -264,8 +264,9 @@ impl ConnCfg {
pub async fn connect(
&self,
allow_self_signed_compute: bool,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
self.do_connect(allow_self_signed_compute)
self.do_connect(allow_self_signed_compute, timeout)
.inspect_err(|err| {
// Immediately log the error we have at our disposal.
error!("couldn't connect to compute node: {err}");

View File

@@ -212,7 +212,7 @@ pub struct CacheOptions {
impl CacheOptions {
/// Default options for [`crate::auth::caches::NodeInfoCache`].
pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=5m";
pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
/// Parse cache options passed via cmdline.
/// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].

View File

@@ -8,7 +8,7 @@ use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, Quer
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::future;
use tokio::net::{TcpListener, TcpStream};
use tracing::{error, info, info_span};
use tracing::{error, info, info_span, Instrument};
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
@@ -44,19 +44,30 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> {
.set_nodelay(true)
.context("failed to set client socket option")?;
tokio::task::spawn(async move {
let span = info_span!("mgmt", peer = %peer_addr);
let _enter = span.enter();
let span = info_span!("mgmt", peer = %peer_addr);
info!("started a new console management API thread");
scopeguard::defer! {
info!("console management API thread is about to finish");
}
tokio::task::spawn(
async move {
info!("serving a new console management API connection");
if let Err(e) = handle_connection(socket).await {
error!("thread failed with an error: {e}");
// these might be long running connections, have a separate logging for cancelling
// on shutdown and other ways of stopping.
let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
let _e = span.entered();
info!("console management API task cancelled");
});
if let Err(e) = handle_connection(socket).await {
error!("serving failed with an error: {e}");
} else {
info!("serving completed");
}
// we can no longer get dropped
scopeguard::ScopeGuard::into_inner(cancelled);
}
});
.instrument(span),
);
}
}
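The rewrite replaces a manually entered span, which is not safe to hold across `.await` points, with `Instrument`, and arms a scopeguard so a dropped (cancelled) task still logs inside its own span. A self-contained sketch of the same pattern, assuming the tokio, tracing, tracing-subscriber, scopeguard, and anyhow crates:

```rust
use tracing::{error, info, info_span, Instrument};

async fn handle_connection() -> anyhow::Result<()> {
    Ok(()) // stand-in for the real connection handler
}

fn spawn_logged(peer: &str) {
    let span = info_span!("mgmt", peer = %peer);
    tokio::task::spawn(
        async move {
            // Armed guard: if this future is dropped before completing
            // (e.g. on runtime shutdown), the closure runs and logs the
            // cancellation inside the task's own span.
            let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
                let _e = span.entered();
                info!("task cancelled");
            });
            if let Err(e) = handle_connection().await {
                error!("serving failed with an error: {e}");
            } else {
                info!("serving completed");
            }
            // Normal exit: defuse the guard so no cancellation is logged.
            scopeguard::ScopeGuard::into_inner(cancelled);
        }
        .instrument(span),
    );
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    spawn_logged("127.0.0.1:5432");
    tokio::task::yield_now().await; // let the spawned task run
}
```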
@@ -77,14 +88,14 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
pgb: &mut PostgresBackendTCP,
query: &str,
) -> Result<(), QueryError> {
try_process_query(pgb, query).await.map_err(|e| {
try_process_query(pgb, query).map_err(|e| {
error!("failed to process response: {e:?}");
e
})
}
}
async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
let span = info_span!("event", session_id = resp.session_id);

View File

@@ -2,12 +2,16 @@
//! Other modules should use stuff from this module instead of
//! directly relying on deps like `reqwest` (think loose coupling).
pub mod conn_pool;
pub mod server;
pub mod sql_over_http;
pub mod websocket;
use std::time::Duration;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use crate::url::ApiUrl;
use reqwest_middleware::RequestBuilder;
@@ -21,6 +25,24 @@ pub fn new_client() -> ClientWithMiddleware {
.build()
}
pub fn new_client_with_timeout(default_timeout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.timeout(default_timeout)
.build()
.expect("Failed to create http client with timeout");
let retry_policy =
ExponentialBackoff::builder().build_with_total_retry_duration(default_timeout);
reqwest_middleware::ClientBuilder::new(timeout_client)
.with(reqwest_tracing::TracingMiddleware::default())
// As per docs, "This middleware always errors when given requests with streaming bodies".
// That's all right because we only use this client to send `serde_json::RawValue`, which
// is not a stream.
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
.build()
}
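A hypothetical call site for the new helper (sketch only: the URL is a placeholder, and `report_usage` is an illustrative name, not code from this change):

```rust
use std::time::Duration;

async fn report_usage(client: &reqwest_middleware::ClientWithMiddleware) -> anyhow::Result<()> {
    // Transient failures are retried with exponential backoff, capped so the
    // total retry time does not exceed the timeout passed at construction.
    let res = client
        .post("https://example.invalid/usage_events") // placeholder URL
        .body(r#"{"events":[]}"#)
        .send()
        .await?;
    anyhow::ensure!(res.status().is_success(), "endpoint refused metrics: {res:?}");
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = new_client_with_timeout(Duration::from_secs(60));
    report_usage(&client).await
}
```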
/// Thin convenience wrapper for an API provided by an http endpoint.
#[derive(Debug, Clone)]
pub struct Endpoint {

proxy/src/http/conn_pool.rs (new file, 278 lines)
View File

@@ -0,0 +1,278 @@
use parking_lot::Mutex;
use pq_proto::StartupMessageParams;
use std::fmt;
use std::{collections::HashMap, sync::Arc};
use futures::TryFutureExt;
use crate::config;
use crate::{auth, console};
use super::sql_over_http::MAX_RESPONSE_SIZE;
use crate::proxy::invalidate_cache;
use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
use tracing::error;
use tracing::info;
pub const APP_NAME: &str = "sql_over_http";
const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug)]
pub struct ConnInfo {
pub username: String,
pub dbname: String,
pub hostname: String,
pub password: String,
}
impl ConnInfo {
// hm, change to hasher to avoid cloning?
pub fn db_and_user(&self) -> (String, String) {
(self.dbname.clone(), self.username.clone())
}
}
impl fmt::Display for ConnInfo {
// use custom display to avoid logging password
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname)
}
}
struct ConnPoolEntry {
conn: tokio_postgres::Client,
_last_access: std::time::Instant,
}
// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
// The number of open connections is limited by `max_conns_per_endpoint`.
pub struct EndpointConnPool {
pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
total_conns: usize,
}
pub struct GlobalConnPool {
// endpoint -> per-endpoint connection pool
//
// This is likely to be a fairly contended map, so return a reference to the
// per-endpoint pool as early as possible and release the lock.
global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
// Maximum number of connections per one endpoint.
// Can mix different (dbname, username) connections.
// When running out of free slots for a particular endpoint,
// falls back to opening a new connection for each request.
max_conns_per_endpoint: usize,
proxy_config: &'static crate::config::ProxyConfig,
}
impl GlobalConnPool {
pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
Arc::new(Self {
global_pool: Mutex::new(HashMap::new()),
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
proxy_config: config,
})
}
pub async fn get(
&self,
conn_info: &ConnInfo,
force_new: bool,
) -> anyhow::Result<tokio_postgres::Client> {
let mut client: Option<tokio_postgres::Client> = None;
if !force_new {
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
// find a pool entry by (dbname, username) if exists
let mut pool = pool.lock();
let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
if let Some(pool_entries) = pool_entries {
if let Some(entry) = pool_entries.pop() {
client = Some(entry.conn);
pool.total_conns -= 1;
}
}
}
// Return the cached connection if found; otherwise establish a new one.
if let Some(client) = client {
if client.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(self.proxy_config, conn_info).await
} else {
info!("pool: reusing connection '{conn_info}'");
Ok(client)
}
} else {
info!("pool: opening a new connection '{conn_info}'");
connect_to_compute(self.proxy_config, conn_info).await
}
}
pub async fn put(
&self,
conn_info: &ConnInfo,
client: tokio_postgres::Client,
) -> anyhow::Result<()> {
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
// return connection to the pool
let mut total_conns;
let mut returned = false;
let mut per_db_size = 0;
{
let mut pool = pool.lock();
total_conns = pool.total_conns;
let pool_entries: &mut Vec<ConnPoolEntry> = pool
.pools
.entry(conn_info.db_and_user())
.or_insert_with(|| Vec::with_capacity(1));
if total_conns < self.max_conns_per_endpoint {
pool_entries.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
total_conns += 1;
returned = true;
per_db_size = pool_entries.len();
pool.total_conns += 1;
}
}
// do logging outside of the mutex
if returned {
info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
}
async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
// find or create a pool for this endpoint
let mut created = false;
let mut global_pool = self.global_pool.lock();
let pool = global_pool
.entry(endpoint.clone())
.or_insert_with(|| {
created = true;
Arc::new(Mutex::new(EndpointConnPool {
pools: HashMap::new(),
total_conns: 0,
}))
})
.clone();
let global_pool_size = global_pool.len();
drop(global_pool);
// log new global pool size
if created {
info!(
"pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
);
}
pool
}
}
//
// Wake up the destination if needed. Code here is a bit involved because
// we reuse the code from the usual proxy, and we need to prepare a few structures
// that this code expects.
//
async fn connect_to_compute(
config: &config::ProxyConfig,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
let credential_params = StartupMessageParams::new([
("user", &conn_info.username),
("database", &conn_info.dbname),
("application_name", APP_NAME),
]);
let creds = config
.auth_backend
.as_ref()
.map(|_| {
auth::ClientCredentials::parse(
&credential_params,
Some(&conn_info.hostname),
common_names,
)
})
.transpose()?;
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some(APP_NAME),
};
let node_info = &mut creds.wake_compute(&extra).await?.expect("msg");
// This code is a copy of `connect_to_compute` from `src/proxy.rs` with
// the difference that it uses `tokio_postgres` for the connection.
let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
loop {
match connect_to_compute_once(node_info, conn_info).await {
Err(e) if num_retries > 0 => {
info!("compute node's state has changed; requesting a wake-up");
match creds.wake_compute(&extra).await? {
// Update `node_info` and try one more time.
Some(new) => {
*node_info = new;
}
// Link auth doesn't work that way, so we just exit.
None => return Err(e),
}
}
other => return other,
}
num_retries -= 1;
info!("retrying after wake-up ({num_retries} attempts left)");
}
}
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut config = (*node_info.config).clone();
let (client, connection) = config
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.inspect_err(|e: &tokio_postgres::Error| {
error!(
"failed to connect to compute node hosts={:?} ports={:?}: {}",
node_info.config.get_hosts(),
node_info.config.get_ports(),
e
);
invalidate_cache(node_info)
})
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
Ok(client)
}
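Stripped of the proxy-specific types, the borrow/return discipline of this pool is a double-keyed map with LIFO reuse and a size cap. A simplified, runnable re-implementation of that idea, with a stub `Conn` standing in for `tokio_postgres::Client`:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

#[derive(Debug)]
struct Conn {
    closed: bool, // stand-in for tokio_postgres::Client::is_closed()
}

#[derive(Default)]
struct Pool {
    // (dbname, username) -> stack of idle connections for one endpoint
    idle: Mutex<HashMap<(String, String), Vec<Conn>>>,
    max: usize,
}

impl Pool {
    fn get(&self, key: &(String, String)) -> Option<Conn> {
        let mut idle = self.idle.lock().unwrap();
        // LIFO: the most recently returned connection is most likely still alive.
        while let Some(conn) = idle.get_mut(key)?.pop() {
            if !conn.closed {
                return Some(conn);
            }
            // silently drop closed connections and keep looking
        }
        None
    }

    fn put(&self, key: (String, String), conn: Conn) -> bool {
        let mut idle = self.idle.lock().unwrap();
        let total: usize = idle.values().map(Vec::len).sum();
        if total >= self.max {
            return false; // pool full: the caller simply drops the connection
        }
        idle.entry(key).or_default().push(conn);
        true
    }
}

fn main() {
    let pool = Pool { max: 20, ..Default::default() };
    let key = ("db".to_string(), "user".to_string());
    assert!(pool.get(&key).is_none()); // miss: caller opens a fresh connection
    assert!(pool.put(key.clone(), Conn { closed: false }));
    assert!(pool.get(&key).is_some()); // hit: reused on the next request
}
```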

View File

@@ -1,25 +1,21 @@
use std::sync::Arc;
use futures::pin_mut;
use futures::StreamExt;
use futures::TryFutureExt;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::{Body, HeaderMap, Request};
use pq_proto::StartupMessageParams;
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::Row;
use tracing::error;
use tracing::info;
use tracing::instrument;
use url::Url;
use crate::proxy::invalidate_cache;
use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
use crate::{auth, config::ProxyConfig, console};
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
#[derive(serde::Deserialize)]
struct QueryData {
@@ -27,12 +23,13 @@ struct QueryData {
params: Vec<serde_json::Value>,
}
const APP_NAME: &str = "sql_over_http";
const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
//
@@ -96,13 +93,6 @@ fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::E
}
}
struct ConnInfo {
username: String,
dbname: String,
hostname: String,
password: String,
}
fn get_conn_info(
headers: &HeaderMap,
sni_hostname: Option<String>,
@@ -169,50 +159,23 @@ fn get_conn_info(
// TODO: return different http error codes
pub async fn handle(
config: &'static ProxyConfig,
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<Value> {
//
// Determine the destination and connection params
//
let headers = request.headers();
let conn_info = get_conn_info(headers, sni_hostname)?;
let credential_params = StartupMessageParams::new([
("user", &conn_info.username),
("database", &conn_info.dbname),
("application_name", APP_NAME),
]);
// Determine the output options. The default behaviour is 'false'. Anything that is not
// strictly 'true' is assumed to be false.
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
//
// Wake up the destination if needed. Code here is a bit involved because
// we reuse the code from the usual proxy and we need to prepare few structures
// that this code expects.
//
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
let creds = config
.auth_backend
.as_ref()
.map(|_| {
auth::ClientCredentials::parse(
&credential_params,
Some(&conn_info.hostname),
common_names,
)
})
.transpose()?;
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some(APP_NAME),
};
let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
// Allow connection pooling only if explicitly requested
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
@@ -235,7 +198,8 @@ pub async fn handle(
//
// Now execute the query and return the result
//
let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
let client = conn_pool.get(&conn_info, !allow_pool).await?;
let row_stream = client.query_raw_txt(query, query_params).await?;
// Manually drain the stream into a vector to leave row_stream hanging
@@ -292,6 +256,13 @@ pub async fn handle(
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
// resulting JSON format is based on the format of node-postgres result
Ok(json!({
"command": command_tag_name,
@@ -302,70 +273,6 @@ pub async fn handle(
}))
}
/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with
/// the difference that it uses `tokio_postgres` for the connection.
#[instrument(skip_all)]
async fn connect_to_compute(
node_info: &mut console::CachedNodeInfo,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
loop {
match connect_to_compute_once(node_info, conn_info).await {
Err(e) if num_retries > 0 => {
info!("compute node's state has changed; requesting a wake-up");
match creds.wake_compute(extra).await? {
// Update `node_info` and try one more time.
Some(new) => {
*node_info = new;
}
// Link auth doesn't work that way, so we just exit.
None => return Err(e),
}
}
other => return other,
}
num_retries -= 1;
info!("retrying after wake-up ({num_retries} attempts left)");
}
}
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut config = (*node_info.config).clone();
let (client, connection) = config
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.inspect_err(|e: &tokio_postgres::Error| {
error!(
"failed to connect to compute node hosts={:?} ports={:?}: {}",
node_info.config.get_hosts(),
node_info.config.get_ports(),
e
);
invalidate_cache(node_info)
})
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
Ok(client)
}
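From the client's perspective, pooling is opt-in per request via the `neon-pool-opt-in` header, alongside the existing `neon-raw-text-output` and `neon-array-mode` switches. A hypothetical request against the `/sql` endpoint (the URL is a placeholder, credentials are omitted, and the body shape is inferred from `QueryData`):

```rust
use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
    let client = reqwest::Client::new();
    let res = client
        .post("https://ep-example.neon.tech/sql") // placeholder endpoint URL
        .header("neon-pool-opt-in", "true")      // ask the proxy to pool this connection
        .header("neon-raw-text-output", "true")  // return values as raw text
        .header("neon-array-mode", "true")       // rows as arrays instead of objects
        .body(r#"{"query": "select 1", "params": []}"#)
        .send()
        .await?;
    println!("status: {}", res.status());
    println!("{}", res.text().await?);
    Ok(())
}
```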
//
// Convert postgres row with text-encoded values to JSON object
//

View File

@@ -35,7 +35,7 @@ use utils::http::{error::ApiError, json::json_response};
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
use sync_wrapper::SyncWrapper;
use super::sql_over_http;
use super::{conn_pool::GlobalConnPool, sql_over_http};
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
@@ -164,6 +164,7 @@ async fn serve_websocket(
async fn ws_handler(
mut request: Request<Body>,
config: &'static ProxyConfig,
conn_pool: Arc<GlobalConnPool>,
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
sni_hostname: Option<String>,
@@ -192,7 +193,7 @@ async fn ws_handler(
// TODO: this deserves a refactor, as the function now also handles http json clients besides websockets.
// Right now I don't want to blow up the sql-over-http patch with file renames, so that will be done as a follow-up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
let result = sql_over_http::handle(config, request, sni_hostname)
let result = sql_over_http::handle(request, sni_hostname, conn_pool)
.instrument(info_span!("sql-over-http"))
.await;
let status_code = match result {
@@ -234,6 +235,8 @@ pub async fn task_main(
info!("websocket server has shut down");
}
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
Some(config) => config.into(),
@@ -258,15 +261,18 @@ pub async fn task_main(
let make_svc =
hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
let conn_pool = conn_pool.clone();
async move {
Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
let sni_name = sni_name.clone();
let conn_pool = conn_pool.clone();
async move {
let cancel_map = Arc::new(CancelMap::default());
let session_id = uuid::Uuid::new_v4();
ws_handler(req, config, cancel_map, session_id, sni_name)
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
.instrument(info_span!(
"ws-client",
session = format_args!("{session_id}")
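Threading the pool through hyper's nested closures requires two clones: one into the per-connection service factory and one into each per-request handler. The shape of that pattern in miniature, with a `String` standing in for `Arc<GlobalConnPool>`:

```rust
use std::sync::Arc;

fn main() {
    let conn_pool = Arc::new(String::from("GlobalConnPool")); // stand-in for the real pool

    // Per-connection service factory: captures its own clone of the pool...
    let make_svc = {
        let conn_pool = conn_pool.clone();
        move || {
            // ...and hands a fresh clone to each per-request handler, because
            // every handler becomes an independent 'static future.
            let conn_pool = conn_pool.clone();
            move |req: &str| format!("{req} handled with {conn_pool}")
        }
    };

    let svc = make_svc();
    println!("{}", svc("POST /sql"));
}
```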

View File

@@ -18,7 +18,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
.from_env_lossy();
let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(atty::is(atty::Stream::Stderr))
.with_ansi(false)
.with_writer(std::io::stderr)
.with_target(false);

View File

@@ -4,11 +4,13 @@ use crate::{config::MetricCollectionConfig, http};
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use serde::Serialize;
use std::collections::HashMap;
use std::{collections::HashMap, time::Duration};
use tracing::{error, info, instrument, trace, warn};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
///
/// Key that uniquely identifies the object this metric describes.
/// Currently, endpoint_id is enough, but this may change later,
@@ -30,7 +32,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
info!("metrics collector has shut down");
}
let http_client = http::new_client();
let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
@@ -182,36 +184,36 @@ async fn collect_metrics_iteration(
}
};
if res.status().is_success() {
// update cached metrics after they were sent successfully
for send_metric in chunk {
let stop_time = match send_metric.kind {
EventType::Incremental { stop_time, .. } => stop_time,
_ => unreachable!(),
};
cached_metrics
.entry(Ids {
endpoint_id: send_metric.extra.endpoint_id.clone(),
branch_id: send_metric.extra.branch_id.clone(),
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 = e.0.saturating_add(send_metric.value);
e.1 = stop_time
})
// cache new metric
.or_insert((send_metric.value, stop_time));
}
} else {
if !res.status().is_success() {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk.iter() {
for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
// Report if the metric value is suspiciously large
if metric.value > (1u64 << 40) {
error!("potentially abnormal metric value: {:?}", metric);
}
error!("potentially abnormal metric value: {:?}", metric);
}
}
// update cached metrics after they were sent
// (to avoid sending the same metrics twice)
// see the relevant discussion on why to do so even if the status is not success:
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
for send_metric in chunk {
let stop_time = match send_metric.kind {
EventType::Incremental { stop_time, .. } => stop_time,
_ => unreachable!(),
};
cached_metrics
.entry(Ids {
endpoint_id: send_metric.extra.endpoint_id.clone(),
branch_id: send_metric.extra.branch_id.clone(),
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 = e.0.saturating_add(send_metric.value);
e.1 = stop_time
})
// cache new metric
.or_insert((send_metric.value, stop_time));
}
}
Ok(())
}
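The cache update itself is a plain entry-API upsert keyed by endpoint and branch. A stripped-down sketch with simplified types (u64 stands in for the DateTime<Utc> stop time):

```rust
use std::collections::HashMap;

type Ids = (String, String); // (endpoint_id, branch_id)

/// Accumulate a transmitted delta into the cache so the same bytes are
/// never reported twice, and remember when the covered interval ended.
fn record_sent(cache: &mut HashMap<Ids, (u64, u64)>, ids: Ids, value: u64, stop_time: u64) {
    cache
        .entry(ids)
        .and_modify(|e| {
            e.0 = e.0.saturating_add(value); // add the delta to the running total
            e.1 = stop_time;
        })
        .or_insert((value, stop_time));
}

fn main() {
    let mut cache = HashMap::new();
    let ids = ("ep-1".to_string(), "br-1".to_string());
    record_sent(&mut cache, ids.clone(), 100, 1);
    record_sent(&mut cache, ids.clone(), 50, 2);
    assert_eq!(cache[&ids], (150, 2)); // deltas accumulate, stop time advances
}
```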

View File

@@ -16,7 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
time,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use utils::measured_stream::MeasuredStream;
@@ -305,12 +308,13 @@ pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
#[tracing::instrument(name = "connect_once", skip_all)]
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute)
.connect(allow_self_signed_compute, timeout)
.inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
.await
}
@@ -328,7 +332,27 @@ async fn connect_to_compute(
loop {
// Apply startup params to the (possibly, cached) compute node info.
node_info.config.set_startup_params(params);
match connect_to_compute_once(node_info).await {
// Set a shorter timeout for the initial connection attempt.
//
// In case we try to connect to an outdated address that is no longer valid, the
// default behavior of Kubernetes is to drop the packets, causing us to wait for
// the entire timeout period. We want to fail fast in such cases.
//
// A specific case to consider is when we have cached compute node information
// with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
// call, resulting in the nonexistence of the compute node.
//
// We only use caching when this is a scram proxy backed by the console, so reduce
// the timeout only in that case.
let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE {
time::Duration::from_secs(2)
} else {
time::Duration::from_secs(10)
};
match connect_to_compute_once(node_info, timeout).await {
Err(e) if num_retries > 0 => {
info!("compute node's state has changed; requesting a wake-up");
match creds.wake_compute(extra).map_err(io_error).await? {
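The timeout choice restated as a pure function, with the retry budget as an assumed constant (the real `NUM_RETRIES_WAKE_COMPUTE` value lives in proxy.rs and is not shown in this diff):

```rust
use std::time::Duration;

// Assumed retry budget, for illustration only.
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;

// The first attempt may target a stale cached address whose packets Kubernetes
// silently drops, so fail fast; attempts after a wake-up get the full timeout.
fn connect_timeout(is_scram_proxy: bool, num_retries: usize) -> Duration {
    if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE {
        Duration::from_secs(2)
    } else {
        Duration::from_secs(10)
    }
}

fn main() {
    assert_eq!(connect_timeout(true, NUM_RETRIES_WAKE_COMPUTE), Duration::from_secs(2));
    assert_eq!(connect_timeout(true, 0), Duration::from_secs(10)); // after a wake-up retry
    assert_eq!(connect_timeout(false, NUM_RETRIES_WAKE_COMPUTE), Duration::from_secs(10)); // e.g. link auth
}
```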

View File

@@ -45,17 +45,74 @@ fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8
let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
parts.into_iter().for_each(|s| mac.update(s));
// TODO: maybe newer `hmac` et al already migrated to regular arrays?
let mut result = [0u8; 32];
result.copy_from_slice(mac.finalize().into_bytes().as_slice());
result
mac.finalize().into_bytes().into()
}
fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
let mut hasher = Sha256::new();
parts.into_iter().for_each(|s| hasher.update(s));
let mut result = [0u8; 32];
result.copy_from_slice(hasher.finalize().as_slice());
result
hasher.finalize().into()
}
#[cfg(test)]
mod tests {
use crate::sasl::{Mechanism, Step};
use super::{password::SaltedPassword, Exchange, ServerSecret};
#[test]
fn happy_path() {
let iterations = 4096;
let salt_base64 = "QSXCR+Q6sek8bf92";
let pw = SaltedPassword::new(
b"pencil",
base64::decode(salt_base64).unwrap().as_slice(),
iterations,
);
let secret = ServerSecret {
iterations,
salt_base64: salt_base64.to_owned(),
stored_key: pw.client_key().sha256(),
server_key: pw.server_key(),
doomed: false,
};
const NONCE: [u8; 18] = [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
];
let mut exchange = Exchange::new(&secret, || NONCE, None);
let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO";
let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0=";
let server_first =
"r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096";
let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0=";
exchange = match exchange.exchange(client_first).unwrap() {
Step::Continue(exchange, message) => {
assert_eq!(message, server_first);
exchange
}
Step::Success(_, _) => panic!("expected continue, got success"),
Step::Failure(f) => panic!("{f}"),
};
let key = match exchange.exchange(client_final).unwrap() {
Step::Success(key, message) => {
assert_eq!(message, server_final);
key
}
Step::Continue(_, _) => panic!("expected success, got continue"),
Step::Failure(f) => panic!("{f}"),
};
assert_eq!(
key.as_bytes(),
[
74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134,
27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112,
]
);
}
}
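The simplification leans on `GenericArray<u8, U32>` converting directly into `[u8; 32]`, which makes the old copy-into-a-buffer dance unnecessary. The two helpers consolidated into a standalone, runnable form, assuming the hmac and sha2 crates:

```rust
use hmac::{Hmac, Mac};
use sha2::{Digest, Sha256};

fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
    parts.into_iter().for_each(|s| mac.update(s));
    // GenericArray<u8, U32> converts directly into [u8; 32]
    mac.finalize().into_bytes().into()
}

fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
    let mut hasher = Sha256::new();
    parts.into_iter().for_each(|s| hasher.update(s));
    hasher.finalize().into()
}

fn main() {
    let tag = hmac_sha256(b"key", [b"hello".as_slice(), b" world".as_slice()]);
    let digest = sha256([b"hello world".as_slice()]);
    println!("hmac = {tag:02x?}");
    println!("sha  = {digest:02x?}");
}
```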

View File

@@ -14,19 +14,7 @@ impl SaltedPassword {
/// See `scram-common.c : scram_SaltedPassword` for details.
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
let one = 1_u32.to_be_bytes(); // magic
let mut current = super::hmac_sha256(password, [salt, &one]);
let mut result = current;
for _ in 1..iterations {
current = super::hmac_sha256(password, [current.as_ref()]);
// TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
for (i, x) in current.iter().enumerate() {
result[i] ^= x;
}
}
result.into()
pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
}
/// Derive `ClientKey` from a salted hashed password.
@@ -46,3 +34,41 @@ impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
Self { bytes }
}
}
#[cfg(test)]
mod tests {
use super::SaltedPassword;
fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
let one = 1_u32.to_be_bytes(); // magic
let mut current = super::super::hmac_sha256(password, [salt, &one]);
let mut result = current;
for _ in 1..iterations {
current = super::super::hmac_sha256(password, [current.as_ref()]);
// TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
for (i, x) in current.iter().enumerate() {
result[i] ^= x;
}
}
result.into()
}
#[test]
fn pbkdf2() {
let password = "a-very-secure-password";
let salt = "such-a-random-salt";
let iterations = 4096;
let output = [
203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
];
let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
assert_eq!(actual.bytes, output);
assert_eq!(actual.bytes, expected.bytes);
}
}
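The one-liner delegates to the pbkdf2 crate; the legacy loop kept in the test above is exactly PBKDF2-HMAC-SHA256 with a single 32-byte output block. A standalone check, assuming the pbkdf2 (default features) and sha2 crates, reusing the test vector from this diff:

```rust
fn main() {
    // Same password/salt/iterations as the test above, so the output matches.
    let derived = pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(
        b"a-very-secure-password",
        b"such-a-random-salt",
        4096, // iteration count; SCRAM secrets store this next to the salt
    );
    assert_eq!(
        derived,
        [
            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194,
            211, 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
        ]
    );
    println!("pbkdf2 ok: {derived:02x?}");
}
```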

View File

@@ -7,7 +7,7 @@ authors = []
[tool.poetry.dependencies]
python = "^3.9"
pytest = "^7.3.1"
psycopg2-binary = "^2.9.1"
psycopg2-binary = "^2.9.6"
typing-extensions = "^4.6.1"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.31.0"

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.68.2"
channel = "1.70.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -266,7 +266,7 @@ impl From<TimelineError> for ApiError {
fn from(te: TimelineError) -> ApiError {
match te {
TimelineError::NotFound(ttid) => {
ApiError::NotFound(anyhow!("timeline {} not found", ttid))
ApiError::NotFound(anyhow!("timeline {} not found", ttid).into())
}
_ => ApiError::InternalServerError(anyhow!("{}", te)),
}

View File

@@ -144,17 +144,24 @@ const reportSummary = async (params) => {
}
summary += `- \`${testName}\`: ${links.join(", ")}\n`
}
const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
summary += "```\n"
summary += `# Run failed on Postgres ${pgVersion} tests locally:\n`
summary += `${command}\n`
summary += "```\n"
}
}
if (failedTestsCount > 0) {
const testsToRerun = []
for (const pgVersion of Object.keys(failedTests)) {
for (const testName of Object.keys(failedTests[pgVersion])) {
testsToRerun.push(...failedTests[pgVersion][testName].map(test => test.name))
}
}
const command = `scripts/pytest -vv -n $(nproc) -k "${testsToRerun.join(" or ")}"`
summary += "```\n"
summary += `# Run all failed tests locally:\n`
summary += `${command}\n`
summary += "```\n"
}
if (flakyTestsCount > 0) {
summary += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
@@ -164,8 +171,7 @@ const reportSummary = async (params) => {
const links = []
for (const test of tests) {
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
links.push(`[${status} ${test.buildType}](${allureLink})`)
links.push(`[${test.buildType}](${allureLink})`)
}
summary += `- \`${testName}\`: ${links.join(", ")}\n`
}

View File

@@ -1,6 +1,6 @@
pytest_plugins = (
"fixtures.pg_version",
"fixtures.allure",
"fixtures.parametrize",
"fixtures.neon_fixtures",
"fixtures.benchmark_fixture",
"fixtures.pg_stats",

View File

@@ -1,25 +0,0 @@
import os
import pytest
from fixtures.pg_version import DEFAULT_VERSION, PgVersion
"""
Set of utilities to make Allure report more informative.
- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
to make tests distinguishable in Allure report.
"""
@pytest.fixture(scope="function", autouse=True)
def allure_noop():
pass
def pytest_generate_tests(metafunc):
if "test_runner/regress" in metafunc.definition._nodeid:
build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])

View File

@@ -59,9 +59,14 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"libmetrics_tracing_event_count_total",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_page_cache_read_hits_total",
"pageserver_page_cache_read_accesses_total",
"pageserver_page_cache_size_current_bytes",
"pageserver_page_cache_size_max_bytes",
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (

View File

@@ -102,8 +102,8 @@ def base_dir() -> Iterator[Path]:
yield base_dir
@pytest.fixture(scope="session")
def neon_binpath(base_dir: Path) -> Iterator[Path]:
@pytest.fixture(scope="function")
def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]:
if os.getenv("REMOTE_ENV"):
# we are in remote env and do not have neon binaries locally
# this is the case for benchmarks run on self-hosted runner
@@ -113,7 +113,6 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]:
if env_neon_bin := os.environ.get("NEON_BIN"):
binpath = Path(env_neon_bin)
else:
build_type = os.environ.get("BUILD_TYPE", "debug")
binpath = base_dir / "target" / build_type
log.info(f"neon_binpath is {binpath}")
@@ -123,7 +122,7 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]:
yield binpath
@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"):
distrib_dir = Path(env_postgres_bin).resolve()
@@ -147,7 +146,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
yield output_dir
@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
versioned_dir = pg_distrib_dir / pg_version.v_prefixed
@@ -174,7 +173,23 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
def myfixture(...)
...
"""
return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session"
scope: Literal["session", "function"]
if os.environ.get("TEST_SHARED_FIXTURES") is None:
# Create the environment in the per-test output directory
scope = "function"
elif (
os.environ.get("BUILD_TYPE") is not None
and os.environ.get("DEFAULT_PG_VERSION") is not None
):
scope = "session"
else:
pytest.fail(
"Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set",
pytrace=False,
)
return scope
@pytest.fixture(scope="session")
@@ -2415,6 +2430,17 @@ class Endpoint(PgProtocol):
return self
def respec(self, **kwargs):
"""Update the endpoint.json file used by control_plane."""
# Read config
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
with open(config_path, "r") as f:
data_dict = json.load(f)
# Write it back updated
with open(config_path, "w") as file:
json.dump(dict(data_dict, **kwargs), file, indent=4)
def stop(self) -> "Endpoint":
"""
Stop the Postgres instance if it's running.

View File

@@ -21,6 +21,18 @@ class PageserverApiException(Exception):
self.status_code = status_code
class TimelineCreate406(PageserverApiException):
def __init__(self, res: requests.Response):
assert res.status_code == 406
super().__init__(res.json()["msg"], res.status_code)
class TimelineCreate409(PageserverApiException):
def __init__(self, res: requests.Response):
assert res.status_code == 409
super().__init__("", res.status_code)
@dataclass
class InMemoryLayerInfo:
kind: str
@@ -309,9 +321,12 @@ class PageserverHttpClient(requests.Session):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
raise TimelineCreate409(res)
if res.status_code == 406:
raise TimelineCreate406(res)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)

View File

@@ -0,0 +1,50 @@
import os
from typing import Optional
import pytest
from _pytest.fixtures import FixtureRequest
from _pytest.python import Metafunc
from fixtures.pg_version import PgVersion
"""
Dynamically parametrize tests by Postgres version and build type (debug/release/remote)
"""
@pytest.fixture(scope="function", autouse=True)
def pg_version(request: FixtureRequest) -> Optional[PgVersion]:
# Do not parametrize performance tests yet, we need to prepare grafana charts first
if "test_runner/performance" in str(request.node.path):
v = os.environ.get("DEFAULT_PG_VERSION")
return PgVersion(v)
return None
@pytest.fixture(scope="function", autouse=True)
def build_type(request: FixtureRequest) -> Optional[str]:
# Do not parametrize performance tests yet, we need to prepare grafana charts first
if "test_runner/performance" in str(request.node.path):
return os.environ.get("BUILD_TYPE", "").lower()
return None
def pytest_generate_tests(metafunc: Metafunc):
# Do not parametrize performance tests yet, we need to prepare grafana charts first
if "test_runner/performance" in metafunc.definition._nodeid:
return
if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
else:
pg_versions = [PgVersion(v)]
if (bt := os.environ.get("BUILD_TYPE")) is None:
build_types = ["debug", "release"]
else:
build_types = [bt.lower()]
metafunc.parametrize("build_type", build_types)
metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))

View File

@@ -1,12 +1,10 @@
import enum
import os
from typing import Iterator, Optional
from typing import Optional
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from pytest import FixtureRequest
from fixtures.log_helper import log
"""
This fixture is used to determine which version of Postgres to use for tests.
@@ -75,18 +73,10 @@ def pytest_addoption(parser: Parser):
"--pg-version",
action="store",
type=PgVersion,
help="Postgres version to use for tests",
help="DEPRECATED: Postgres version to use for tests",
)
@pytest.fixture(scope="session")
def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
if v := request.config.getoption("--pg-version"):
version, source = v, "from --pg-version command-line argument"
elif v := os.environ.get("DEFAULT_PG_VERSION"):
version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
else:
version, source = DEFAULT_VERSION, "default version"
log.info(f"pg_version is {version} ({source})")
yield version
def pytest_configure(config: Config):
if config.getoption("--pg-version"):
raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")

View File

@@ -32,13 +32,18 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
env.neon_cli.create_branch("test_startup")
endpoint = None
# We do two iterations so we can see if the second startup is faster. It should
# be because the compute node should already be configured with roles, databases,
# extensions, etc from the first run.
for i in range(2):
# Start
with zenbenchmark.record_duration(f"{i}_start_and_select"):
endpoint = env.endpoints.create_start("test_startup")
if endpoint:
endpoint.start()
else:
endpoint = env.endpoints.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Get metrics
@@ -47,6 +52,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
"wait_for_spec_ms": f"{i}_wait_for_spec",
"sync_safekeepers_ms": f"{i}_sync_safekeepers",
"basebackup_ms": f"{i}_basebackup",
"start_postgres_ms": f"{i}_start_postgres",
"config_ms": f"{i}_config",
"total_startup_ms": f"{i}_total_startup",
}
@@ -57,6 +63,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
# Stop so we can restart
endpoint.stop()
# Imitate optimizations that console would do for the second start
endpoint.respec(skip_pg_catalog_updates=True)
# This test sometimes runs for longer than the global 5 minute timeout.
@pytest.mark.timeout(600)

View File

@@ -1,2 +1,2 @@
pg8000==1.29.4
pg8000==1.29.8
scramp>=1.4.3

View File

@@ -396,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "openssl"
version = "0.10.52"
version = "0.10.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
dependencies = [
"bitflags",
"cfg-if",
@@ -428,9 +428,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.87"
version = "0.9.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
dependencies = [
"cc",
"libc",

View File

@@ -1,4 +1,4 @@
FROM rust:1.69
FROM rust:1.70
WORKDIR /source
COPY . .

View File

@@ -5,8 +5,8 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/vapor/postgres-nio.git",
"state" : {
"revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937",
"version" : "1.14.1"
"revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab",
"version" : "1.16.0"
}
},
{
@@ -59,8 +59,8 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-nio.git",
"state" : {
"revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75",
"version" : "2.51.1"
"revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf",
"version" : "2.54.0"
}
},
{

View File

@@ -4,7 +4,7 @@ import PackageDescription
let package = Package(
name: "PostgresNIOExample",
dependencies: [
.package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1")
.package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0")
],
targets: [
.executableTarget(

View File

@@ -5,23 +5,7 @@
"packages": {
"": {
"dependencies": {
"postgresql-client": "2.5.5"
}
},
"node_modules/debug": {
"version": "4.3.4",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
"integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
"dependencies": {
"ms": "2.1.2"
},
"engines": {
"node": ">=6.0"
},
"peerDependenciesMeta": {
"supports-color": {
"optional": true
}
"postgresql-client": "2.5.9"
}
},
"node_modules/doublylinked": {
@@ -41,11 +25,6 @@
"putil-promisify": "^1.8.6"
}
},
"node_modules/ms": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
"integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
},
"node_modules/obuf": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz",
@@ -63,30 +42,28 @@
}
},
"node_modules/postgresql-client": {
"version": "2.5.5",
"resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz",
"integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==",
"version": "2.5.9",
"resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz",
"integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==",
"dependencies": {
"debug": "^4.3.4",
"doublylinked": "^2.5.2",
"lightning-pool": "^4.2.1",
"postgres-bytea": "^3.0.0",
"power-tasks": "^1.6.4",
"power-tasks": "^1.7.0",
"putil-merge": "^3.10.3",
"putil-promisify": "^1.10.0",
"putil-varhelpers": "^1.6.5"
},
"engines": {
"node": ">=14.0",
"node": ">=16.0",
"npm": ">=7.0.0"
}
},
"node_modules/power-tasks": {
"version": "1.6.4",
"resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz",
"integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==",
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz",
"integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==",
"dependencies": {
"debug": "^4.3.4",
"doublylinked": "^2.5.2",
"strict-typed-events": "^2.3.1"
},
@@ -132,9 +109,9 @@
}
},
"node_modules/ts-gems": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.3.0.tgz",
"integrity": "sha512-bUvrwrzlct7vfaNvtgMhynDf6lAki/kTtrNsIGhX6l7GJGK3s6b8Ro7dazOLXabV0m2jyShBzDQ8X1+h/C2Cug=="
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz",
"integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A=="
}
}
}

Some files were not shown because too many files have changed in this diff.