Do vacuum freeze before copying data

Use cursor to copy data
Fix mapping of TOAST tables
2026-03-13 21:30:37 +00:00 · 2023-07-05 22:17:06 +03:00 · 2023-07-05 21:49:55 +03:00 · 2023-07-05 19:26:29 +03:00 · 2023-07-05 15:47:13 +03:00 · 2023-07-05 15:36:42 +03:00
160 changed files with 7453 additions and 3756 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -180,7 +180,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    timeout-minutes: 360 # 6h
+    # Increase timeout to 8h, default timeout is 6h
    timeout-minutes: 480
    steps:
    - uses: actions/checkout@v3
@@ -321,8 +322,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
@@ -414,8 +413,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
@@ -501,8 +498,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
      - name: Install rust binaries
        run: |
@@ -623,51 +623,6 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
  neon-image-depot:
    # For testing this will run side-by-side for a few merges.
    # This action is not really optimized yet, but gets the job done
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    permissions:
      contents: read
      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
      - name: Setup go
        uses: actions/setup-go@v3
        with:
          go-version: '1.19'
      - name: Set up Depot CLI
        uses: depot/setup-action@v1
      - name: Install Crane & ECR helper
        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
      - name: Configure ECR login
        run: |
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
      - name: Build and push
        uses: depot/build-push-action@v1
        with:
          # if no depot.json file is at the root of your repo, you must specify the project id
          project: nrdv0s4kcs
          push: true
          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
          build-args: |
            GIT_VERSION=${{ github.sha }}
            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
@@ -704,6 +659,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -761,10 +717,40 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup
      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
      # so we won't build extension twice, but extract them from compute-node.
      #
      # For now we use extensions image only for new custom extensions
      - name: Kaniko build extensions only
        run: |
          # Kaniko is supposed to clean up after itself if --cleanup flag is set, but it doesn't.
          # Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
          # it still fails with error:
          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
          #
          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
                           --context . \
                           --build-arg GIT_VERSION=${{ github.sha }} \
                           --build-arg PG_VERSION=${{ matrix.version }} \
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
                           --dockerfile Dockerfile.compute-node \
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
                           --cleanup \
                           --target postgres-extensions
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
@@ -781,7 +767,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.8.0
+      VM_BUILDER_VERSION: v0.11.1
    steps:
      - name: Checkout
@@ -883,8 +869,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
      - name: Push images to production ECR
        if: |
@@ -895,8 +883,10 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
      - name: Configure Docker Hub login
        run: |
@@ -918,16 +908,93 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
  upload-postgres-extensions-to-s3:
    if: |
      (github.ref_name == 'main' || github.ref_name == 'release') &&
       github.event_name != 'workflow_dispatch'
    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
    needs: [ tag, promote-images ]
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15 ]
    env:
      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
      # Later all the extensions will be moved to extensions image.
      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
      S3_BUCKETS: |
        ${{ github.ref_name == 'release' &&
          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
    steps:
      - name: Pull postgres-extensions image
        run: |
          docker pull ${EXTENSIONS_IMAGE}
          docker pull ${COMPUTE_NODE_IMAGE}
      - name: Create postgres-extensions container
        id: create-container
        run: |
          EID=$(docker create ${EXTENSIONS_IMAGE} true)
          echo "EID=${EID}" >> $GITHUB_OUTPUT
          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
          echo "CID=${CID}" >> $GITHUB_OUTPUT
      - name: Extract postgres-extensions from container
        run: |
          rm -rf ./extensions-to-upload ./custom-extensions # Just in case
          # In compute image we have a bit different directory layout
          mkdir -p extensions-to-upload/share
          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
          # Delete Neon extensions (they are always present on compute-node image)
          rm -rf ./extensions-to-upload/share/extension/neon*
          rm -rf ./extensions-to-upload/lib/neon*
          # Delete leftovers from the extension build step
          rm -rf ./extensions-to-upload/lib/pgxs
          rm -rf ./extensions-to-upload/lib/pkgconfig
          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
          for EXT_NAME in $(ls ./custom-extensions); do
            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
          done
      - name: Upload postgres-extensions to S3
        run: |
          for BUCKET in $(echo ${S3_BUCKETS}); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done
      - name: Cleanup
        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
        run: |
          docker rm ${{ steps.create-container.outputs.CID }} || true
          docker rm ${{ steps.create-container.outputs.EID }} || true
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ promote-images, tag, regress-tests ]
+    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -959,6 +1026,20 @@ jobs:
            exit 1
          fi
      - name: Create git tag
        if: github.ref_name == 'release'
        uses: actions/github-script@v6
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,6 +3,7 @@ name: Create Release Branch
 on:
  schedule:
    - cron: '0 10 * * 2'
  workflow_dispatch:
 jobs:
  create_release_branch:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -200,17 +200,6 @@ dependencies = [
 "critical-section",
 ]
 [[package]]
 name = "atty"
 version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
 dependencies = [
 "hermit-abi 0.1.19",
 "libc",
 "winapi",
 ]
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -805,18 +794,6 @@ dependencies = [
 "libloading",
 ]
 [[package]]
 name = "clap"
 version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
 dependencies = [
 "bitflags",
 "clap_lex 0.2.4",
 "indexmap",
 "textwrap",
 ]
 [[package]]
 name = "clap"
 version = "4.3.0"
@@ -837,7 +814,7 @@ dependencies = [
 "anstream",
 "anstyle",
 "bitflags",
- "clap_lex 0.5.0",
+ "clap_lex",
 "strsim",
 ]
@@ -853,15 +830,6 @@ dependencies = [
 "syn 2.0.16",
 ]
 [[package]]
 name = "clap_lex"
 version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
 dependencies = [
 "os_str_bytes",
 ]
 [[package]]
 name = "clap_lex"
 version = "0.5.0"
@@ -915,7 +883,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "clap 4.3.0",
+ "clap",
 "compute_api",
 "futures",
 "hyper",
@@ -977,7 +945,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 4.3.0",
+ "clap",
 "comfy-table",
 "compute_api",
 "git-version",
@@ -1047,19 +1015,19 @@ dependencies = [
 [[package]]
 name = "criterion"
-version = "0.4.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
 dependencies = [
 "anes",
 "atty",
 "cast",
 "ciborium",
- "clap 3.2.25",
+ "clap",
 "criterion-plot",
 "is-terminal",
 "itertools",
 "lazy_static",
 "num-traits",
 "once_cell",
 "oorandom",
 "plotters",
 "rayon",
@@ -1140,7 +1108,7 @@ dependencies = [
 "crossterm_winapi",
 "libc",
 "mio",
- "parking_lot",
+ "parking_lot 0.12.1",
 "signal-hook",
 "signal-hook-mio",
 "winapi",
@@ -1210,7 +1178,7 @@ dependencies = [
 "hashbrown 0.12.3",
 "lock_api",
 "once_cell",
- "parking_lot_core",
+ "parking_lot_core 0.9.7",
 ]
 [[package]]
@@ -1676,15 +1644,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 [[package]]
 name = "hermit-abi"
 version = "0.1.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
 dependencies = [
 "libc",
 ]
 [[package]]
 name = "hermit-abi"
 version = "0.2.6"
@@ -1939,6 +1898,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
 dependencies = [
 "cfg-if",
 "js-sys",
 "wasm-bindgen",
 "web-sys",
 ]
 [[package]]
@@ -2267,16 +2229,6 @@ dependencies = [
 "windows-sys 0.45.0",
 ]
 [[package]]
 name = "nu-ansi-term"
 version = "0.46.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
 dependencies = [
 "overload",
 "winapi",
 ]
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2349,9 +2301,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 [[package]]
 name = "openssl"
-version = "0.10.52"
+version = "0.10.55"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
+checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
 dependencies = [
 "bitflags",
 "cfg-if",
@@ -2381,9 +2333,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 [[package]]
 name = "openssl-sys"
-version = "0.9.87"
+version = "0.9.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
+checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
 dependencies = [
 "cc",
 "libc",
@@ -2504,31 +2456,19 @@ dependencies = [
 "winapi",
 ]
 [[package]]
 name = "os_str_bytes"
 version = "6.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
 [[package]]
 name = "outref"
 version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
 [[package]]
 name = "overload"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 [[package]]
 name = "pagectl"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
- "clap 4.3.0",
+ "clap",
 "git-version",
 "pageserver",
 "postgres_ffi",
@@ -2547,7 +2487,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
- "clap 4.3.0",
+ "clap",
 "close_fds",
 "const_format",
 "consumption_metrics",
@@ -2629,6 +2569,17 @@ dependencies = [
 "workspace_hack",
 ]
 [[package]]
 name = "parking_lot"
 version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
 dependencies = [
 "instant",
 "lock_api",
 "parking_lot_core 0.8.6",
 ]
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2636,7 +2587,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core",
+ "parking_lot_core 0.9.7",
 ]
 [[package]]
 name = "parking_lot_core"
 version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
 dependencies = [
 "cfg-if",
 "instant",
 "libc",
 "redox_syscall 0.2.16",
 "smallvec",
 "winapi",
 ]
 [[package]]
@@ -2652,6 +2617,16 @@ dependencies = [
 "windows-sys 0.45.0",
 ]
 [[package]]
 name = "pbkdf2"
 version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
 ]
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -2770,7 +2745,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2783,7 +2758,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2794,7 +2769,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2812,7 +2787,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2957,7 +2932,7 @@ dependencies = [
 "lazy_static",
 "libc",
 "memchr",
- "parking_lot",
+ "parking_lot 0.12.1",
 "procfs",
 "thiserror",
 ]
@@ -3022,12 +2997,11 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
 "atty",
 "base64 0.13.1",
 "bstr",
 "bytes",
 "chrono",
- "clap 4.3.0",
+ "clap",
 "consumption_metrics",
 "futures",
 "git-version",
@@ -3045,7 +3019,8 @@ dependencies = [
 "native-tls",
 "once_cell",
 "opentelemetry",
- "parking_lot",
+ "parking_lot 0.12.1",
 "pbkdf2",
 "pin-project-lite",
 "postgres-native-tls",
 "postgres_backend",
@@ -3056,6 +3031,7 @@ dependencies = [
 "regex",
 "reqwest",
 "reqwest-middleware",
 "reqwest-retry",
 "reqwest-tracing",
 "routerify",
 "rstest",
@@ -3291,6 +3267,29 @@ dependencies = [
 "thiserror",
 ]
 [[package]]
 name = "reqwest-retry"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
 dependencies = [
 "anyhow",
 "async-trait",
 "chrono",
 "futures",
 "getrandom",
 "http",
 "hyper",
 "parking_lot 0.11.2",
 "reqwest",
 "reqwest-middleware",
 "retry-policies",
 "task-local-extensions",
 "tokio",
 "tracing",
 "wasm-timer",
 ]
 [[package]]
 name = "reqwest-tracing"
 version = "0.4.4"
@@ -3309,6 +3308,17 @@ dependencies = [
 "tracing-opentelemetry",
 ]
 [[package]]
 name = "retry-policies"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
 dependencies = [
 "anyhow",
 "chrono",
 "rand",
 ]
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3507,7 +3517,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
- "clap 4.3.0",
+ "clap",
 "const_format",
 "crc32c",
 "fs2",
@@ -3518,7 +3528,7 @@ dependencies = [
 "hyper",
 "metrics",
 "once_cell",
- "parking_lot",
+ "parking_lot 0.12.1",
 "postgres",
 "postgres-protocol",
 "postgres_backend",
@@ -3937,7 +3947,7 @@ dependencies = [
 "anyhow",
 "async-stream",
 "bytes",
- "clap 4.3.0",
+ "clap",
 "const_format",
 "futures",
 "futures-core",
@@ -3947,7 +3957,7 @@ dependencies = [
 "hyper",
 "metrics",
 "once_cell",
- "parking_lot",
+ "parking_lot 0.12.1",
 "prost",
 "tokio",
 "tokio-stream",
@@ -4118,12 +4128,6 @@ dependencies = [
 "syn 1.0.109",
 ]
 [[package]]
 name = "textwrap"
 version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 [[package]]
 name = "thiserror"
 version = "1.0.40"
@@ -4272,7 +4276,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4281,7 +4285,7 @@ dependencies = [
 "futures-channel",
 "futures-util",
 "log",
- "parking_lot",
+ "parking_lot 0.12.1",
 "percent-encoding",
 "phf",
 "pin-project-lite",
@@ -4539,7 +4543,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 4.3.0",
+ "clap",
 "pageserver_api",
 "utils",
 "workspace_hack",
@@ -4641,7 +4645,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
 "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -4810,7 +4813,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
 "atty",
 "bincode",
 "byteorder",
 "bytes",
@@ -4887,7 +4889,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 4.3.0",
+ "clap",
 "env_logger",
 "log",
 "once_cell",
@@ -4991,6 +4993,21 @@ version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
 [[package]]
 name = "wasm-timer"
 version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
 dependencies = [
 "futures",
 "js-sys",
 "parking_lot 0.11.2",
 "pin-utils",
 "wasm-bindgen",
 "wasm-bindgen-futures",
 "web-sys",
 ]
 [[package]]
 name = "web-sys"
 version = "0.3.63"
@@ -5252,7 +5269,7 @@ dependencies = [
 "anyhow",
 "bytes",
 "chrono",
- "clap 4.3.0",
+ "clap",
 "clap_builder",
 "crossbeam-utils",
 "either",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,7 +34,6 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
@@ -87,6 +86,7 @@ opentelemetry = "0.18.0"
 opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
 pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
@@ -95,6 +95,7 @@ regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
 rustls = "0.20"
@@ -128,7 +129,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
-tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -140,11 +141,11 @@ env_logger = "0.10"
 log = "0.4"
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 ## Other git libraries
@@ -170,7 +171,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 ## Build dependencies
-criterion = "0.4"
+criterion = "0.5.1"
 rcgen = "0.10"
 rstest = "0.17"
 tempfile = "3.4"
@@ -180,7 +181,7 @@ tonic-build = "0.9"
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -2,6 +2,7 @@ ARG PG_VERSION
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 #########################################################################################
 #
@@ -67,7 +68,7 @@ RUN apt update && \
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
-    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /
@@ -95,7 +96,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -188,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
-    echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
+    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,7 +356,7 @@ RUN apt-get update && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
-    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +411,7 @@ RUN apt-get update && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,6 +433,108 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
 #########################################################################################
 #
 # Layer "rdkit-pg-build"
 # compile rdkit extension
 #
 #########################################################################################
 FROM build-deps AS rdkit-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt-get update && \
    apt-get install -y \
        cmake \
        libboost-iostreams1.74-dev \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
        libboost-system1.74-dev \
        libeigen3-dev \
        libfreetype6-dev
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
        -D RDK_BUILD_AVALON_SUPPORT=ON \
        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
        -D RDK_BUILD_DESCRIPTORS3D=OFF \
        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
        -D RDK_USE_URF=OFF \
        -D RDK_BUILD_PGSQL=ON \
        -D RDK_PGSQL_STATIC=ON \
        -D PostgreSQL_CONFIG=pg_config \
        -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
 #########################################################################################
 #
 # Layer "pg-uuidv7-pg-build"
 # compile pg_uuidv7 extension
 #
 #########################################################################################
 FROM build-deps AS pg-uuidv7-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
 #########################################################################################
 #
 # Layer "pg-roaringbitmap-pg-build"
 # compile pg_roaringbitmap extension
 #
 #########################################################################################
 FROM build-deps AS pg-roaringbitmap-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
 #########################################################################################
 #
 # Layer "pg-anon-pg-build"
 # compile anon extension
 #
 #########################################################################################
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # Kaniko doesn't allow `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead (17 is the length of `/usr/local/pgsql/`)
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sort  > /before.txt && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
    find /usr/local/pgsql -type f | sort  > /after.txt && \
    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -540,6 +643,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
 # Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -564,6 +668,9 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -585,6 +692,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
@@ -609,6 +719,22 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a
 #########################################################################################
 #
 # Extensions only
 #
 #########################################################################################
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensions.
 # As for now, it's only for new custom ones
 #
 # # Default extensions
 # COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
 # COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
 # Custom extensions
 COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
 COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
 #########################################################################################
 #
 # Final layer
@@ -637,14 +763,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
        locales \
        libicu67 \
        liblz4-1 \
        libreadline8 \
        libboost-iostreams1.74.0 \
        libboost-regex1.74.0 \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
        libossp-uuid16 \
        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -654,7 +785,9 @@ RUN apt update &&  \
        libxslt1.1 \
        libzstd1 \
        libcurl4-openssl-dev \
-        procps && \
+        locales \
        procps \
        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -3,6 +3,7 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
 WORKDIR /home/nonroot
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
 ARG CACHEPOT_BUCKET=neon-github-dev
 #ARG AWS_ACCESS_KEY_ID
 #ARG AWS_SECRET_ACCESS_KEY
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG
 COPY . .
--- a/README.md
+++ b/README.md
@@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > cargo neon init
-Starting pageserver at '127.0.0.1:64000' in '.neon'.
+Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"
 # start pageserver, safekeeper, and broker for their intercommunication
 > cargo neon start
-Starting neon broker at 127.0.0.1:50051
+Starting neon broker at 127.0.0.1:50051.
 storage_broker started, pid: 2918372
-Starting pageserver at '127.0.0.1:64000' in '.neon'.
+Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
 pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
@@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
-Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
+Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
 Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
 # check list of running postgres instances
 > cargo neon endpoint list
@@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 # start postgres on that branch
 > cargo neon endpoint start migration_check --branch-name migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
-Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
 Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
 # check the new list of running postgres instances
 > cargo neon endpoint list
 ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
 main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
- migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
+ migration_check  127.0.0.1:55434  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
-> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
 key | value
 -----+-------
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -54,9 +54,15 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 const BUILD_TAG_DEFAULT: &str = "local";
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
    info!("build_tag: {build_tag}");
    let matches = cli().get_matches();
    let http_port = *matches
@@ -250,6 +256,16 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
    if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
        info!("syncing safekeepers on shutdown");
        let storage_auth_token = pspec.storage_auth_token.clone();
        let lsn = compute.sync_safekeepers(storage_auth_token)?;
        info!("synced safekeepers at lsn {lsn}");
    }
    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -133,6 +133,84 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }
 /// Create the special `neon_superuser` role: a slightly nerfed version of a real
 /// superuser that we give to customers.
 ///
 /// Builds a single `DO $$ ... $$` block and sends it with `client.simple_query`.
 /// The block only acts when `neon_superuser` does not exist yet. In that case it:
 ///
 /// * creates the role (`CREATEDB CREATEROLE NOLOGIN`, member of `pg_read_all_data`
 ///   and `pg_write_all_data`);
 /// * grants it to every role from `spec.cluster.roles` that is present in `pg_roles`
 ///   and gives each of those roles `CREATEROLE CREATEDB`;
 /// * grants `ALL PRIVILEGES` on every database from `spec.cluster.databases` that is
 ///   present in `pg_database`.
 ///
 /// # Errors
 ///
 /// Returns the error reported by `simple_query`, with the full query text attached
 /// as context.
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Role and database names are embedded as SQL string literals inside `IN (...)`.
    let roles = spec
        .cluster
        .roles
        .iter()
        .map(|r| format!("'{}'", escape_literal(&r.name)))
        .collect::<Vec<_>>();
    let dbs = spec
        .cluster
        .databases
        .iter()
        .map(|db| format!("'{}'", escape_literal(&db.name)))
        .collect::<Vec<_>>();
    // An empty `IN ()` list is a syntax error, so declare the array as NULL instead;
    // the `array_length(...) IS NOT NULL` checks below then skip the grants.
    let roles_decl = if roles.is_empty() {
        String::from("roles text[] := NULL;")
    } else {
        format!(
            r#"
               roles text[] := ARRAY(SELECT rolname
                                     FROM pg_catalog.pg_roles
                                     WHERE rolname IN ({}));"#,
            roles.join(", ")
        )
    };
    let database_decl = if dbs.is_empty() {
        String::from("dbs text[] := NULL;")
    } else {
        format!(
            r#"
               dbs text[] := ARRAY(SELECT datname
                                   FROM pg_catalog.pg_database
                                   WHERE datname IN ({}));"#,
            dbs.join(", ")
        )
    };
    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
    let query = format!(
        r#"
            DO $$
                DECLARE
                    r text;
                    {}
                    {}
                BEGIN
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
                            FOREACH r IN ARRAY roles LOOP
                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
                            END LOOP;
                        END IF;
                        IF array_length(dbs, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
                        END IF;
                    END IF;
                END
            $$;"#,
        roles_decl, database_decl,
    );
    // Logged before the query runs, and the query is a no-op when the role already
    // exists, so the message must not claim that the role has been created.
    info!("Creating neon_superuser role (if missing) with query:\n{}", &query);
    client
        .simple_query(&query)
        .map_err(|e| anyhow::anyhow!(e).context(query))?;
    Ok(())
 }
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
@@ -157,7 +235,7 @@ impl ComputeNode {
    // Get basebackup from the libpq connection to pageserver using `connstr` and
    // unarchive it to `pgdata` directory overriding all its previous content.
-    #[instrument(skip(self, compute_state))]
+    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Utc::now();
@@ -199,8 +277,8 @@ impl ComputeNode {
    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
-    #[instrument(skip(self, storage_auth_token))]
+    #[instrument(skip_all)]
-    fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
+    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();
        let sync_handle = Command::new(&self.pgbin)
@@ -244,7 +322,7 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
-    #[instrument(skip(self, compute_state))]
+    #[instrument(skip_all)]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
@@ -302,7 +380,7 @@ impl ComputeNode {
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
-    #[instrument(skip(self))]
+    #[instrument(skip_all)]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
@@ -326,7 +404,7 @@ impl ComputeNode {
    }
    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip(self, compute_state))]
+    #[instrument(skip_all)]
    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
        // If connection fails,
        // it may be the old node with `zenith_admin` superuser.
@@ -347,6 +425,8 @@ impl ComputeNode {
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                // Disable forwarding so that users don't get a cloud_admin role
                client.simple_query("SET neon.forward_ddl = false")?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
@@ -357,31 +437,28 @@ impl ComputeNode {
            Ok(client) => client,
        };
        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        client.simple_query("SET neon.forward_ddl = false")?;
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        // 'Close' connection
        drop(client);
        info!(
            "finished configuration of compute for project {}",
            spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );
        Ok(())
    }
    // We could've wrapped this around `pg_ctl reload`, but right now we don't use
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
-    #[instrument(skip(self, client))]
+    #[instrument(skip_all)]
    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
@@ -389,7 +466,7 @@ impl ComputeNode {
    /// Similar to `apply_config()`, but does a bit different sequence of operations,
    /// as it's used to reconfigure a previously started and configured Postgres node.
-    #[instrument(skip(self))]
+    #[instrument(skip_all)]
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
@@ -407,7 +484,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(&spec, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }
@@ -424,36 +501,41 @@ impl ComputeNode {
        Ok(())
    }
-    #[instrument(skip(self))]
+    #[instrument(skip_all)]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
-        let spec = compute_state.pspec.as_ref().expect("spec must be set");
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
-            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            spec.tenant_id,
+            pspec.tenant_id,
-            spec.timeline_id,
+            pspec.timeline_id,
        );
        self.prepare_pgdata(&compute_state)?;
        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
-        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
+        let config_time = Utc::now();
-
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
        if spec.spec.mode == ComputeMode::Primary {
            self.apply_config(&compute_state)?;
        }
        let startup_end_time = Utc::now();
        {
            let mut state = self.state.lock().unwrap();
-            state.metrics.config_ms = startup_end_time
+            state.metrics.start_postgres_ms = config_time
                .signed_duration_since(start_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
            state.metrics.config_ms = startup_end_time
                .signed_duration_since(config_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
            state.metrics.total_startup_ms = startup_end_time
                .signed_duration_since(compute_state.start_time)
                .to_std()
@@ -462,6 +544,11 @@ impl ComputeNode {
        }
        self.set_status(ComputeStatus::Running);
        info!(
            "finished configuration of compute for project {}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );
        Ok(pg)
    }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;
 use crate::compute::ComputeNode;
-#[instrument(skip(compute))]
+#[instrument(skip_all)]
 fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -18,6 +18,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_ansi(false)
        .with_target(false)
        .with_writer(std::io::stderr);
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 /// Escape a string for including it in a SQL literal
-fn escape_literal(s: &str) -> String {
+pub fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }
@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
 /// Wait for Postgres to become ready to accept connections. It's ready to
 /// accept connections when the state-field in `pgdata/postmaster.pid` says
 /// 'ready'.
-#[instrument(skip(pg))]
+#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
 pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    let pid_path = pgdata.join("postmaster.pid");
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
+                let mut query: String = format!(
                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
                let grant_query = format!(
                    "GRANT pg_read_all_data, pg_write_all_data TO {}",
                    name.pg_quote()
                );
                xact.execute(grant_query.as_str(), &[])?;
                info!("role grant query: '{}'", &grant_query);
            }
        }
@@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                query.push_str(&db.to_pg_options());
                let _guard = info_span!("executing", query).entered();
                client.execute(query.as_str(), &[])?;
                let grant_query: String = format!(
                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
                    name.pg_quote()
                );
                client.execute(grant_query.as_str(), &[])?;
            }
        };
@@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
    info!("cluster spec grants:");
    // We now have a separate `web_access` role to connect to the database
    // via the web interface and proxy link auth. And also we grant a
    // read / write all data privilege to every role. So also grant
    // create to everyone.
    // XXX: later we should stop messing with Postgres ACL in such horrible
    // ways.
    let roles = spec
        .cluster
        .roles
        .iter()
        .map(|r| r.name.pg_quote())
        .collect::<Vec<_>>();
    for db in &spec.cluster.databases {
        let dbname = &db.name;
        let query: String = format!(
            "GRANT CREATE ON DATABASE {} TO {}",
            dbname.pg_quote(),
            roles.join(", ")
        );
        info!("grant query {}", &query);
        client.execute(query.as_str(), &[])?;
    }
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -180,6 +180,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    }
    // Wait until process is gone
    wait_until_stopped(process_name, pid)?;
    Ok(())
 }
 pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -308,7 +308,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    env.init(pg_version)
+    let force = init_match.get_flag("force");
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;
    // Initialize pageserver, create initial tenant and timeline.
@@ -1013,6 +1014,13 @@ fn cli() -> Command {
        .help("If set, the node will be a hot replica on the specified timeline")
        .required(false);
    let force_arg = Arg::new("force")
        .value_parser(value_parser!(bool))
        .long("force")
        .action(ArgAction::SetTrue)
        .help("Force initialization even if the repository is not empty")
        .required(false);
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1028,6 +1036,7 @@ fn cli() -> Command {
                        .value_name("config"),
                )
                .arg(pg_version_arg.clone())
                .arg(force_arg)
        )
        .subcommand(
            Command::new("timeline")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -67,6 +67,7 @@ pub struct EndpointConf {
    pg_port: u16,
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
 }
 //
@@ -135,6 +136,7 @@ impl ComputeControlPlane {
            mode,
            tenant_id,
            pg_version,
            skip_pg_catalog_updates: false,
        });
        ep.create_endpoint_dir()?;
@@ -148,6 +150,7 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
                skip_pg_catalog_updates: false,
            })?,
        )?;
        std::fs::write(
@@ -183,6 +186,9 @@ pub struct Endpoint {
    // the endpoint runs in.
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    // Optimizations
    skip_pg_catalog_updates: bool,
 }
 impl Endpoint {
@@ -216,6 +222,7 @@ impl Endpoint {
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
        })
    }
@@ -398,6 +405,16 @@ impl Endpoint {
                String::from_utf8_lossy(&pg_ctl.stderr),
            );
        }
        // Also wait for the compute_ctl process to die. It might have some cleanup
        // work to do after postgres stops, like syncing safekeepers, etc.
        //
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
        Ok(())
    }
@@ -450,6 +467,7 @@ impl Endpoint {
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
            cluster: Cluster {
@@ -499,7 +517,13 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
-        let _child = cmd.spawn()?;
+        let child = cmd.spawn()?;
        // Write down the pid so we can wait for it when we want to stop
        // TODO use background_process::start_process instead
        let pid = child.id();
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        std::fs::write(pidfile_path, pid.to_string())?;
        // Wait for it to start
        let mut attempt = 0;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -364,7 +364,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -372,11 +372,29 @@ impl LocalEnv {
            "repository base path is missing"
        );
-        ensure!(
+        if base_path.exists() {
-            !base_path.exists(),
+            if force {
-            "directory '{}' already exists. Perhaps already initialized?",
+                println!("removing all contents of '{}'", base_path.display());
-            base_path.display()
+                // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-        );
+                // all contents inside. This helps if the developer symbol links another directory (i.e.,
                // S3 local SSD) to the `.neon` base directory.
                for entry in std::fs::read_dir(base_path)? {
                    let entry = entry?;
                    let path = entry.path();
                    if path.is_dir() {
                        fs::remove_dir_all(&path)?;
                    } else {
                        fs::remove_file(&path)?;
                    }
                }
            } else {
                bail!(
                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
                    base_path.display()
                );
            }
        }
        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
@@ -392,7 +410,9 @@ impl LocalEnv {
            }
        }
-        fs::create_dir(base_path)?;
+        if !base_path.exists() {
            fs::create_dir(base_path)?;
        }
        // Generate keypair for JWT.
        //
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
 }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -27,6 +27,12 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
    /// An optional hint that can be passed to speed up startup time if we know
    /// that no pg catalog mutations (like role creation, database creation,
    /// extension creation) need to be done on the actual database to start.
    #[serde(default)] // Default false
    pub skip_pg_catalog_updates: bool,
    // Information needed to connect to the storage layer.
    //
    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
@@ -142,4 +148,14 @@ mod tests {
        let file = File::open("tests/cluster_spec.json").unwrap();
        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }
    /// Forward compatibility: a spec that carries a field unknown to this
    /// version must still deserialize into `ComputeSpec`.
    #[test]
    fn parse_unknown_fields() {
        let spec_file = File::open("tests/cluster_spec.json").unwrap();
        let mut spec_json: serde_json::Value = serde_json::from_reader(spec_file).unwrap();
        // Inject a top-level key that `ComputeSpec` does not declare.
        spec_json
            .as_object_mut()
            .unwrap()
            .insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(spec_json).unwrap();
    }
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 pub mod metric_vec_duration;
 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -0,0 +1,23 @@
 //! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
 use std::{future::Future, time::Instant};
 /// Sink that is told how an operation ended and how long it took.
 pub trait DurationResultObserver {
    /// Records that an operation finished with `res` after running for `duration`.
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
 }
 /// Awaits `block`, reports its result together with the elapsed time to
 /// `observer`, and returns the result to the caller unchanged.
 pub async fn observe_async_block_duration_by_result<T, E, F, O>(
    observer: &O,
    block: F,
 ) -> Result<T, E>
 where
    F: Future<Output = Result<T, E>>,
    O: DurationResultObserver,
 {
    let started_at = Instant::now();
    let outcome = block.await;
    // The observer only borrows the outcome, so it can still be returned afterwards.
    observer.observe_result(&outcome, started_at.elapsed());
    outcome
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -152,7 +152,7 @@ pub enum ActivatingFrom {
 }
 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -165,7 +165,7 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken,
+    Broken { reason: String, backtrace: String },
 }
 #[serde_as]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,6 +70,14 @@ impl RemotePath {
    pub fn join(&self, segment: &Path) -> Self {
        Self(self.0.join(segment))
    }
    /// Returns a reference to the wrapped path.
    pub fn get_path(&self) -> &PathBuf {
        &self.0
    }
    /// Returns the extension of the wrapped path as a `&str`, or `None` when the
    /// path has no extension or the extension is not valid UTF-8.
    pub fn extension(&self) -> Option<&str> {
        self.0.extension()?.to_str()
    }
 }
 /// Storage (potentially remote) API to manage its state.
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError>;
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtly different from list_prefixes,
    /// because it is for listing files instead of listing
    /// names sharing common prefixes.
    /// For example,
    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
@@ -111,6 +132,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<Download, DownloadError>;
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }
 pub struct Download {
@@ -172,6 +195,14 @@ impl GenericRemoteStorage {
        }
    }
    /// Lists files under `folder` by forwarding the call to the `list_files` of
    /// whichever backend (`LocalFs`, `AwsS3` or `Unreliable`) this value wraps.
    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -223,6 +254,14 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
    /// Deletes `paths` by forwarding the call to the `delete_objects` of
    /// whichever backend (`LocalFs`, `AwsS3` or `Unreliable`) this value wraps.
    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
 }
 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe::path_with_suffix_extension;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 use crate::{Download, DownloadError, RemotePath};
@@ -48,6 +48,14 @@ impl LocalFs {
        Ok(Self { storage_root })
    }
    /// Turns a path located under `storage_root` into a `RemotePath` relative
    /// to that root; mirrors S3Bucket::s3_object_to_relative_path.
    ///
    /// Panics if `key` does not start with `storage_root`.
    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
        let stripped = key.strip_prefix(&self.storage_root);
        RemotePath(
            stripped
                .expect("relative path must contain storage_root as prefix")
                .to_path_buf(),
        )
    }
    async fn read_storage_metadata(
        &self,
        file_path: &Path,
@@ -101,19 +109,63 @@ impl RemoteStorage for LocalFs {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
-        Ok(get_all_files(path.as_ref(), false)
+
        let prefixes_to_filter = get_all_files(path.as_ref(), false)
            .await
-            .map_err(DownloadError::Other)?
+            .map_err(DownloadError::Other)?;
-            .into_iter()
+
-            .map(|path| {
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-                path.strip_prefix(&self.storage_root)
+
-                    .context("Failed to strip preifix")
+        // filter out empty directories to mirror s3 behavior.
        for prefix in prefixes_to_filter {
            if prefix.is_dir()
                && is_directory_empty(&prefix)
                    .await
                    .map_err(DownloadError::Other)?
            {
                continue;
            }
            prefixes.push(
                prefix
                    .strip_prefix(&self.storage_root)
                    .context("Failed to strip prefix")
                    .and_then(RemotePath::new)
                    .expect(
                        "We list files for storage root, hence should be able to remote the prefix",
-                    )
+                    ),
-            })
+            )
-            .collect())
+        }
        Ok(prefixes)
    }
    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
        };
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];
        while !directory_queue.is_empty() {
            let cur_folder = directory_queue
                .pop()
                .expect("queue cannot be empty: we just checked");
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
                let full_file_name = cur_folder.clone().join(&file_name);
                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
                files.push(file_remote_path.clone());
                if full_file_name.is_dir() {
                    directory_queue.push(full_file_name);
                }
            }
        }
        Ok(files)
    }
    async fn upload(
@@ -291,11 +343,25 @@ impl RemoteStorage for LocalFs {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if file_path.exists() && file_path.is_file() {
+        if !file_path.exists() {
-            Ok(fs::remove_file(file_path).await?)
+            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
-        } else {
+            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
-            bail!("File {file_path:?} either does not exist or is not a file")
+            return Ok(());
        }
        if !file_path.is_file() {
            anyhow::bail!("{file_path:?} is not a file");
        }
        Ok(fs::remove_file(file_path)
            .await
            .map_err(|e| anyhow::anyhow!(e))?)
    }
    /// Deletes each of `paths` in order through `delete`, stopping at and
    /// returning the first error; deletions that already happened are not undone.
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        for path in paths {
            self.delete(path).await?
        }
        Ok(())
    }
 }
@@ -320,7 +386,7 @@ where
                    let file_type = dir_entry.file_type().await?;
                    let entry_path = dir_entry.path();
                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} us a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +661,11 @@ mod fs_tests {
        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());
-        match storage.delete(&upload_target).await {
+        storage
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
+            .delete(&upload_target)
-            Err(e) => {
+            .await
-                let error_string = e.to_string();
+            .expect("Should allow deleting non-existing storage files");
-                assert!(error_string.contains("does not exist"));
+
                let expected_path = upload_target.with_base(&storage.storage_root);
                assert!(error_string.contains(expected_path.to_str().unwrap()));
            }
        }
        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_http::body::SdkBody;
@@ -33,6 +34,8 @@ use crate::{
    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 /// Maximum number of keys sent in a single `delete_objects` request; longer
 /// lists are split into chunks of this size. The S3 DeleteObjects API accepts
 /// at most 1000 keys per request.
 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
 pub(super) mod metrics {
    use metrics::{register_int_counter_vec, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -81,12 +84,24 @@ pub(super) mod metrics {
            .inc();
    }
    /// Adds `count` to `S3_REQUESTS_COUNT` under the `delete_object` label.
    pub fn inc_delete_objects(count: u64) {
        S3_REQUESTS_COUNT
            .with_label_values(&["delete_object"])
            .inc_by(count);
    }
    /// Increments `S3_REQUESTS_FAIL_COUNT` under the `delete_object` label by one.
    pub fn inc_delete_object_fail() {
        S3_REQUESTS_FAIL_COUNT
            .with_label_values(&["delete_object"])
            .inc();
    }
    /// Adds `count` to `S3_REQUESTS_FAIL_COUNT` under the `delete_object` label,
    /// the same label that `inc_delete_object_fail` uses.
    pub fn inc_delete_objects_fail(count: u64) {
        S3_REQUESTS_FAIL_COUNT
            .with_label_values(&["delete_object"])
            .inc_by(count);
    }
    pub fn inc_list_objects() {
        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
    }
@@ -332,6 +347,51 @@ impl RemoteStorage for S3Bucket {
        Ok(document_keys)
    }
    /// See the doc for `RemoteStorage::list_files`
    ///
    /// Lists every object key under `folder` (falling back to the bucket-wide
    /// prefix when `folder` is `None`), following continuation tokens until the
    /// listing is exhausted.
    ///
    /// # Errors
    /// Fails if the concurrency limiter is closed, if a list request fails, or
    /// if the response contains an object without a key.
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
            // A permit is acquired for each page request and released at the end
            // of the iteration.
            let _guard = self
                .concurrency_limiter
                .acquire()
                .await
                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
            metrics::inc_list_objects();
            let response = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(folder_name.clone())
                .set_continuation_token(continuation_token)
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
                .map_err(|e| {
                    metrics::inc_list_objects_fail();
                    e
                })
                .context("Failed to list files in S3 bucket")?;
            for object in response.contents().unwrap_or_default() {
                // The key comes from a network response, so a missing key is
                // reported as an error instead of panicking.
                let object_path = object
                    .key()
                    .context("S3 list_files response contains an object without a key")?;
                all_files.push(self.s3_object_to_relative_path(object_path));
            }
            match response.next_continuation_token {
                Some(new_token) => continuation_token = Some(new_token),
                None => break,
            }
        }
        Ok(all_files)
    }
    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -396,6 +456,50 @@ impl RemoteStorage for S3Bucket {
        })
        .await
    }
    /// Deletes `paths` from the bucket in batches of at most
    /// `MAX_DELETE_OBJECTS_REQUEST_SIZE` keys per request.
    ///
    /// Stops at the first batch whose request fails or whose response reports
    /// per-object errors; in both cases the failure metric is updated and an
    /// error is returned.
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        // A single permit covers the whole call, however many batches it takes.
        let _permit = self
            .concurrency_limiter
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 delete")?;
        let object_ids: Vec<_> = paths
            .iter()
            .map(|path| {
                ObjectIdentifier::builder()
                    .set_key(Some(self.relative_path_to_s3_object(path)))
                    .build()
            })
            .collect();
        for batch in object_ids.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
            let batch_len = batch.len() as u64;
            metrics::inc_delete_objects(batch_len);
            let request = Delete::builder().set_objects(Some(batch.to_vec())).build();
            let outcome = self
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
                .delete(request)
                .send()
                .await;
            let response = match outcome {
                Ok(response) => response,
                Err(e) => {
                    // The whole request failed, so every key in the batch counts as failed.
                    metrics::inc_delete_objects_fail(batch_len);
                    return Err(e.into());
                }
            };
            if let Some(errors) = response.errors {
                let failed = errors.len();
                metrics::inc_delete_objects_fail(failed as u64);
                return Err(anyhow::format_err!(
                    "Failed to delete {} objects",
                    failed
                ));
            }
        }
        Ok(())
    }
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let _guard = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -24,6 +24,7 @@ enum RemoteOp {
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
 }
 impl UnreliableWrapper {
@@ -82,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_prefixes(prefix).await
    }
    /// Lists files through the wrapped storage once `attempt` lets the call pass.
    ///
    /// The call is registered with `attempt` as a `RemoteOp::ListPrefixes`
    /// operation; no dedicated list-files variant is used here.
    // NOTE(review): `attempt` presumably injects the simulated failures; confirm
    // that sharing the `ListPrefixes` key with `list_prefixes` is intended.
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
        self.inner.list_files(folder).await
    }
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -119,4 +125,21 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Delete(path.clone()))?;
        self.inner.delete(path).await
    }
    /// Deletes every path in `paths` through this wrapper's own `delete`,
    /// carrying on past individual failures and reporting how many of them
    /// failed once all paths have been tried.
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut failed_deletes = 0;
        for target in paths {
            match self.delete(target).await {
                Ok(_) => {}
                Err(_) => failed_deletes += 1,
            }
        }
        if failed_deletes == 0 {
            return Ok(());
        }
        Err(anyhow::anyhow!(
            "failed to delete {} objects",
            failed_deletes
        ))
    }
 }
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -1,274 +0,0 @@
 use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 use anyhow::Context;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};
 /// Name of the environment variable that opts in to the real-S3 test; the test context only checks that it is set.
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 /// Tests that S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
 /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
 /// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
 /// where
 /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
 ///
 /// Then, verifies that the client does return correct prefixes when queried:
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
 /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
 /// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
 /// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
 /// since current default AWS S3 pagination limit is 1000.
 /// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    // Skip silently when real S3 is disabled; fail outright when the fixture upload did not succeed.
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };
    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();
    let base_prefix =
        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
    // Listing without a prefix must yield exactly the base prefix.
    let root_remote_prefixes = test_client
        .list_prefixes(None)
        .await
        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );
    // Listing under the base prefix must yield exactly the uploaded sub-prefixes.
    let nested_remote_prefixes = test_client
        .list_prefixes(Some(&base_prefix))
        .await
        .context("client list nested prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    // Compute both set differences so a failure reports what is extra and what is missing.
    let remote_only_prefixes = nested_remote_prefixes
        .difference(&expected_remote_prefixes)
        .collect::<HashSet<_>>();
    let missing_uploaded_prefixes = expected_remote_prefixes
        .difference(&nested_remote_prefixes)
        .collect::<HashSet<_>>();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );
    Ok(())
 }
 /// Test context for the S3 pagination test, produced by its `AsyncTestContext::setup`.
 enum MaybeEnabledS3 {
    /// Real S3 is enabled and every test blob was uploaded.
    Enabled(S3WithTestBlobs),
    /// The enabling env var is not set; the test returns early.
    Disabled,
    /// At least one upload failed; the blobs that did get uploaded are kept so teardown can remove them.
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }
 /// S3 client plus the bookkeeping of what the test setup uploaded through it.
 struct S3WithTestBlobs {
    // Client created with a small `max_keys_per_list_response` (see `setup`).
    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
    // Common prefix under which the test objects were uploaded.
    base_prefix_str: &'static str,
    // Prefixes of the successfully uploaded blobs; the expected result of the nested listing.
    remote_prefixes: HashSet<RemotePath>,
    // Full paths of the successfully uploaded blobs; removed in teardown.
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
    /// Initialises logging, then — if the enabling env var is set — creates the S3 client and
    /// uploads the test blobs. Panics if logging or client creation fails.
    async fn setup() -> Self {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
        )
        .expect("logging init failed");
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }
        let max_keys_in_list_response = 10;
        // 1 + 2 * 10 = 21 uploads: more than two full list responses at this limit.
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
            .context("S3 client creation")
            .expect("S3 client creation failed");
        let base_prefix_str = "test/";
        match upload_s3_data(
            &client_with_excessive_pagination,
            base_prefix_str,
            upload_tasks_count,
        )
        .await
        {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");
                Self::Enabled(S3WithTestBlobs {
                    client_with_excessive_pagination,
                    base_prefix_str,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                })
            }
            // Keep whatever was uploaded so that teardown can still delete it.
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithTestBlobs {
                    client_with_excessive_pagination,
                    base_prefix_str,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                },
            ),
        }
    }
    /// Deletes every recorded blob, whether or not all uploads succeeded.
    async fn teardown(self) {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
            }
        }
    }
 }
 /// Builds an S3-backed `GenericRemoteStorage` for the test.
 ///
 /// Bucket name and region come from the `REMOTE_STORAGE_S3_BUCKET` and `REMOTE_STORAGE_S3_REGION`
 /// env vars; an error is returned if either is missing, if the system clock is before the Unix
 /// epoch, or if the storage cannot be created from the config.
 fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
    // Milliseconds since the epoch: gives each client its own prefix in the bucket.
    let random_prefix_part = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
        .as_millis();
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response: Some(max_keys_per_list_response),
        }),
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
 /// What `upload_s3_data` managed to upload.
 struct Uploads {
    // Prefix of each successfully uploaded blob.
    prefixes: HashSet<RemotePath>,
    // Full path of each successfully uploaded blob.
    blobs: HashSet<RemotePath>,
 }
 /// Uploads `upload_tasks_count` small blobs, one spawned task per blob, each at
 /// `{base_prefix_str}/sub_prefix_{i}/blob_{i}`.
 ///
 /// Returns `Continue` when every task succeeded and `Break` when at least one failed; both
 /// carry the prefixes and paths of the uploads that did succeed.
 async fn upload_s3_data(
    client: &Arc<GenericRemoteStorage>,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
 ) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
            let blob_prefix = RemotePath::new(&prefix)
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            // Take the length before `data` is moved into the cursor.
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }
    let mut upload_tasks_failed = false;
    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    // Drain all tasks even after a failure so every successful upload is recorded.
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok((upload_prefix, upload_path)) => {
                uploaded_prefixes.insert(upload_prefix);
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }
    let uploads = Uploads {
        prefixes: uploaded_prefixes,
        blobs: uploaded_blobs,
    };
    if upload_tasks_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
 }
 /// Deletes `objects_to_delete` from the remote storage, one spawned task per object.
 ///
 /// Best effort: delete errors and task join errors are logged and otherwise ignored.
 async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }
    // Wait for every task; a failure is only logged, never propagated.
    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -0,0 +1,542 @@
 use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 use anyhow::Context;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};
 /// Guards the one-time logging initialisation performed by `ensure_logging_ready`.
 static LOGGING_DONE: OnceCell<()> = OnceCell::new();
 /// Name of the environment variable that opts in to the real-S3 tests; the test contexts only check that it is set.
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 /// Prefix stored in `EnabledS3::base_prefix` and used as the root of the test object paths.
 const BASE_PREFIX: &str = "test/";
 /// Checks that the S3 client lists every prefix even when the listing comes back paginated and takes several S3 queries.
 /// Runs against real S3, so it needs [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and the related S3 credential env vars;
 /// see [`create_s3_client`] for the variables that are read.
 /// When real S3 tests are disabled the test returns success without doing anything: the default test framework
 /// cannot mark a test as ignored at runtime, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
 /// Setup uploads objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
 /// where
 /// * `random_prefix_part` is fixed per S3 client in [`create_s3_client`], so that separate test runs do not interfere
 /// * `base_prefix_str` is the common prefix used in the client requests, to exercise listing of nested prefixes inside the bucket
 ///
 /// The test then verifies the prefixes returned by the client:
 /// * with no prefix, everything after `${random_prefix_part}/` is listed — that should be the `${base_prefix_str}` value only
 /// * with the `${base_prefix_str}/` prefix, every `sub_prefix_${i}` is listed
 ///
 /// The client is created with a `max-keys` limit on list responses, so pagination is tested implicitly: all results must
 /// still be returned, without having to upload more blobs than the current default AWS S3 pagination limit of 1000.
 /// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
 ///
 /// Finally, teardown tries to remove all uploaded S3 files.
 /// Errors during that clean up are logged, but the test is not failed or stopped until the clean up is finished.
 #[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
 async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();
    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    // Root listing: only the base prefix is expected.
    let listed_at_root = test_client
        .list_prefixes(None)
        .await
        .context("client list root prefixes failure")?;
    let root_remote_prefixes: HashSet<_> = listed_at_root.into_iter().collect();
    assert_eq!(
        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );
    // Nested listing: exactly the uploaded sub-prefixes are expected.
    let listed_under_base = test_client
        .list_prefixes(Some(&base_prefix))
        .await
        .context("client list nested prefixes failure")?;
    let nested_remote_prefixes: HashSet<_> = listed_under_base.into_iter().collect();
    // Both directions of the difference are kept so a failure names the extra and the missing prefixes.
    let remote_only_prefixes: HashSet<_> = nested_remote_prefixes
        .difference(&expected_remote_prefixes)
        .collect();
    let missing_uploaded_prefixes: HashSet<_> = expected_remote_prefixes
        .difference(&nested_remote_prefixes)
        .collect();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );
    Ok(())
 }
 /// Checks that the S3 client lists every file in a folder even when the listing comes back paginated and requires several S3 queries.
 /// Runs against real S3, so it needs [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and the related S3 credential env vars. The test skips the real code and passes if the env var is not set.
 /// See `s3_pagination_should_work` for more information.
 ///
 /// Setup uploads objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_s3_data`].
 /// The test then runs two queries:
 ///    1. `list_files(None)`, which should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`, which should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
 #[tokio::test]
 async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
    // Query 1: everything that was uploaded must be listed from the root.
    let listed_at_root = test_client
        .list_files(None)
        .await
        .context("client list root files failure")?;
    let root_files: HashSet<_> = listed_at_root.into_iter().collect();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
        "remote storage list_files on root mismatches with the uploads."
    );
    // Query 2: only the uploads whose path starts with "folder1" must be listed.
    let listed_in_folder = test_client
        .list_files(Some(&base_prefix))
        .await
        .context("client list nested files failure")?;
    let nested_remote_files: HashSet<_> = listed_in_folder.into_iter().collect();
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
        .filter_map(|blob| {
            let name = blob.get_path().to_str().expect("must be valid name");
            name.starts_with("folder1")
                .then(|| RemotePath::new(Path::new(name)).expect("must be valid name"))
        })
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
 /// Checks that deleting a path under which nothing was uploaded by this test still succeeds.
 /// Passes without doing anything when real S3 tests are disabled.
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Disabled => return Ok(()),
        MaybeEnabledS3::Enabled(ctx) => ctx,
    };
    let missing_object = PathBuf::from(format!(
        "{}/for_sure_there_is_nothing_there_really",
        ctx.base_prefix,
    ));
    let path = RemotePath::new(&missing_object).with_context(|| "RemotePath conversion")?;
    let delete_result = ctx.client.delete(&path).await;
    delete_result.expect("should succeed");
    Ok(())
 }
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
    };
    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let data1 = "remote blob data1".as_bytes();
    let data1_len = data1.len();
    let data2 = "remote blob data2".as_bytes();
    let data2_len = data2.len();
    let data3 = "remote blob data3".as_bytes();
    let data3_len = data3.len();
    ctx.client
        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
        .await?;
    ctx.client
        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
        .await?;
    ctx.client
        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
        .await?;
    ctx.client.delete_objects(&[path1, path2]).await?;
    let prefixes = ctx.client.list_prefixes(None).await?;
    assert_eq!(prefixes.len(), 1);
    ctx.client.delete_objects(&[path3]).await?;
    Ok(())
 }
 /// Initialises test logging through `LOGGING_DONE`, so the initialiser runs at most once
 /// however many test contexts call this. Panics if the initialisation fails.
 fn ensure_logging_ready() {
    let init_logging = || {
        let init_result = utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
        );
        init_result.expect("logging init failed");
    };
    LOGGING_DONE.get_or_init(init_logging);
 }
 /// State shared by the real-S3 test contexts once the tests are enabled.
 struct EnabledS3 {
    // Remote storage client built by `create_s3_client`.
    client: Arc<GenericRemoteStorage>,
    // Prefix the tests use as the root of their object paths; set to `BASE_PREFIX` in `setup`.
    base_prefix: &'static str,
 }
 impl EnabledS3 {
    /// Creates the S3 client with the given list-response key limit and pairs it with
    /// `BASE_PREFIX`. Panics if the client cannot be created.
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let creation_result =
            create_s3_client(max_keys_in_list_response).context("S3 client creation");
        Self {
            client: creation_result.expect("S3 client creation failed"),
            base_prefix: BASE_PREFIX,
        }
    }
 }
 /// Test context for tests that need an S3 client but no pre-uploaded blobs.
 enum MaybeEnabledS3 {
    /// Real S3 is enabled; carries the client and base prefix.
    Enabled(EnabledS3),
    /// The enabling env var is not set; tests return early.
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
    /// Makes sure logging is initialised, then creates the S3 client (without a list-response
    /// key limit) when the enabling env var is set; otherwise logs the skip and returns `Disabled`.
    async fn setup() -> Self {
        ensure_logging_ready();
        let real_s3_enabled = env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_ok();
        if real_s3_enabled {
            Self::Enabled(EnabledS3::setup(None).await)
        } else {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            Self::Disabled
        }
    }
 }
 /// Test context for the prefix-listing test: an S3 client plus blobs uploaded by `upload_s3_data`.
 enum MaybeEnabledS3WithTestBlobs {
    /// Real S3 is enabled and every test blob was uploaded.
    Enabled(S3WithTestBlobs),
    /// The enabling env var is not set; the test returns early.
    Disabled,
    /// At least one upload failed; the blobs that did get uploaded are kept so teardown can remove them.
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }
 /// S3 client plus the bookkeeping of what `upload_s3_data` uploaded through it.
 struct S3WithTestBlobs {
    // Client and base prefix used for the uploads.
    enabled: EnabledS3,
    // Prefixes of the successfully uploaded blobs; the expected result of the nested listing.
    remote_prefixes: HashSet<RemotePath>,
    // Full paths of the successfully uploaded blobs; removed in teardown.
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    /// Makes sure logging is initialised and, when the enabling env var is set, creates an S3
    /// client with a list-response key limit of 10 and uploads 21 blobs through it.
    /// A failed upload yields `UploadsFailed`, still carrying whatever was uploaded.
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }
        let max_keys_in_list_response = 10;
        // One more than two full list responses at this limit.
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
        let upload_outcome =
            upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await;
        let (uploads, all_uploaded) = match upload_outcome {
            ControlFlow::Continue(uploads) => (uploads, true),
            ControlFlow::Break(uploads) => (uploads, false),
        };
        let with_blobs = S3WithTestBlobs {
            enabled,
            remote_prefixes: uploads.prefixes,
            remote_blobs: uploads.blobs,
        };
        if all_uploaded {
            info!("Remote objects created successfully");
            Self::Enabled(with_blobs)
        } else {
            Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                with_blobs,
            )
        }
    }
    /// Deletes every recorded blob, whether or not all uploads succeeded.
    async fn teardown(self) {
        if let Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) = self {
            cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
        }
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar.
 // However, they are not identical. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details.
 /// Test context for the file-listing test: an S3 client plus blobs uploaded by `upload_simple_s3_data`.
 enum MaybeEnabledS3WithSimpleTestBlobs {
    /// Real S3 is enabled and every test blob was uploaded.
    Enabled(S3WithSimpleTestBlobs),
    /// The enabling env var is not set; the test returns early.
    Disabled,
    /// At least one upload failed; the blobs that did get uploaded are kept so teardown can remove them.
    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
 }
 /// S3 client plus the paths that `upload_simple_s3_data` uploaded through it.
 struct S3WithSimpleTestBlobs {
    // Client and base prefix.
    enabled: EnabledS3,
    // Full paths of the successfully uploaded blobs; compared against listings and removed in teardown.
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
    /// Makes sure logging is initialised and, when the enabling env var is set, creates an S3
    /// client with a list-response key limit of 10 and uploads 21 `folder{j}/blob_{i}.txt` blobs.
    /// A failed upload yields `UploadsFailed`, still carrying whatever was uploaded.
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }
        let max_keys_in_list_response = 10;
        // One more than two full list responses at this limit.
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
        let upload_outcome = upload_simple_s3_data(&enabled.client, upload_tasks_count).await;
        let (remote_blobs, all_uploaded) = match upload_outcome {
            ControlFlow::Continue(uploads) => (uploads, true),
            ControlFlow::Break(uploads) => (uploads, false),
        };
        let with_blobs = S3WithSimpleTestBlobs {
            enabled,
            remote_blobs,
        };
        if all_uploaded {
            info!("Remote objects created successfully");
            Self::Enabled(with_blobs)
        } else {
            Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                with_blobs,
            )
        }
    }
    /// Deletes every recorded blob, whether or not all uploads succeeded.
    async fn teardown(self) {
        if let Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) = self {
            cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
        }
    }
 }
 /// Builds an S3-backed `GenericRemoteStorage` for the tests.
 ///
 /// Bucket name and region are read from the `REMOTE_STORAGE_S3_BUCKET` and
 /// `REMOTE_STORAGE_S3_REGION` env vars. Each client gets its own in-bucket prefix derived
 /// from the current time in nanoseconds since the Unix epoch.
 ///
 /// Returns an error if either env var is missing, if the system clock is before the Unix
 /// epoch, or if the storage cannot be created from the config.
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let bucket_name = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let bucket_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
    let since_epoch = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?;
    let random_prefix_part = since_epoch.as_nanos();
    let s3_config = S3Config {
        bucket_name,
        bucket_region,
        prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
        endpoint: None,
        concurrency_limit: NonZeroUsize::new(100).unwrap(),
        max_keys_per_list_response,
    };
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(s3_config),
    };
    let storage =
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
    Ok(Arc::new(storage))
 }
 /// What `upload_s3_data` managed to upload.
 struct Uploads {
    // Prefix of each successfully uploaded blob.
    prefixes: HashSet<RemotePath>,
    // Full path of each successfully uploaded blob.
    blobs: HashSet<RemotePath>,
 }
 async fn upload_s3_data(
    client: &Arc<GenericRemoteStorage>,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
 ) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
            let blob_prefix = RemotePath::new(&prefix)
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }
    let mut upload_tasks_failed = false;
    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok((upload_prefix, upload_path)) => {
                uploaded_prefixes.insert(upload_prefix);
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }
    let uploads = Uploads {
        prefixes: uploaded_prefixes,
        blobs: uploaded_blobs,
    };
    if upload_tasks_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
 }
 async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }
    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
 }
 // Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
 async fn upload_simple_s3_data(
    client: &Arc<GenericRemoteStorage>,
    upload_tasks_count: usize,
 ) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut upload_tasks = JoinSet::new();
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
            let blob_path = RemotePath::new(&blob_path)
                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>(blob_path)
        });
    }
    let mut upload_tasks_failed = false;
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok(upload_path) => {
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }
    if upload_tasks_failed {
        ControlFlow::Break(uploaded_blobs)
    } else {
        ControlFlow::Continue(uploaded_blobs)
    }
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 license.workspace = true
 [dependencies]
 atty.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,6 +1,8 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};
 use anyhow::Context;
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
    }
 }
 /// Returns `Ok(true)` if the directory at `path` contains no entries, `Ok(false)` otherwise.
 ///
 /// # Errors
 ///
 /// Fails if `tokio::fs::read_dir` cannot open `path` (the error is annotated with the
 /// path), or if reading the first directory entry fails.
 pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
    let mut dir = tokio::fs::read_dir(&path)
        .await
        .context(format!("read_dir({})", path.as_ref().display()))?;
    // Only the first entry is needed: its absence means the directory is empty.
    Ok(dir.next_entry().await?.is_none())
 }
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;
    use crate::fs_ext::is_directory_empty;
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
    // Exercises `is_directory_empty` on an empty directory, on a regular file,
    // and on a path that no longer exists.
    #[tokio::test]
    async fn is_empty_dir_async() {
        let dir = tempfile::tempdir().unwrap();
        let dir_path = dir.path();
        // test positive case
        assert!(
            is_directory_empty(dir_path).await.expect("test failure"),
            "new tempdir should be empty"
        );
        // invoke on a file to ensure it returns an error
        let file_path: PathBuf = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        // close the file handle right away; only the file's existence matters here
        drop(f);
        assert!(is_directory_empty(&file_path).await.is_err());
        // do it again on a path we know to be nonexistent
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,19 +1,18 @@
 use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
 use hyper::Method;
-use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
+use hyper::{header::CONTENT_TYPE, Body, Request, Response};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::net::TcpListener;
 use std::str::FromStr;
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -348,40 +347,6 @@ pub fn check_permission_with(
    }
 }
 ///
 /// Start listening for HTTP requests on given socket.
 ///
 /// Builds the router from `router_builder`, creates a current-thread tokio runtime
 /// and blocks the calling thread on it while serving requests from `listener`.
 ///
 /// 'shutdown_future' can be used to stop. If the Future becomes
 /// ready, we stop listening for new requests, and the function returns.
 ///
 /// # Errors
 ///
 /// Returns an error if the listener's local address cannot be read, the router
 /// fails to build, the runtime cannot be created, the server cannot be set up
 /// from `listener`, or the server itself finishes with an error.
 ///
 /// # Panics
 ///
 /// Panics if `RouterService::new` fails, because its result is unwrapped.
 ///
 pub fn serve_thread_main<S>(
    router_builder: RouterBuilder<hyper::Body, ApiError>,
    listener: TcpListener,
    shutdown_future: S,
 ) -> anyhow::Result<()>
 where
    S: Future<Output = ()> + Send + Sync,
 {
    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
    // Create a Service from the given router builder to handle incoming requests.
    let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
    // Create a single-threaded tokio runtime that is driven by the current thread
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()?;
    // NOTE(review): presumably the runtime context is entered here because
    // `Server::from_tcp` must be called from within a runtime — confirm against hyper docs.
    let _guard = runtime.enter();
    let server = Server::from_tcp(listener)?
        .serve(service)
        .with_graceful_shutdown(shutdown_future);
    // Blocks until the server finishes, i.e. after `shutdown_future` resolves or on error.
    runtime.block_on(server)?;
    Ok(())
 }
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,5 +1,6 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
 use std::error::Error as StdError;
 use thiserror::Error;
 use tracing::error;
@@ -15,13 +16,13 @@ pub enum ApiError {
    Unauthorized(String),
    #[error("NotFound: {0}")]
-    NotFound(anyhow::Error),
+    NotFound(Box<dyn StdError + Send + Sync + 'static>),
    #[error("Conflict: {0}")]
    Conflict(String),
    #[error("Precondition failed: {0}")]
-    PreconditionFailed(&'static str),
+    PreconditionFailed(Box<str>),
    #[error(transparent)]
    InternalServerError(anyhow::Error),
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -84,7 +84,7 @@ pub fn init(
    let r = r.with({
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
-            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_ansi(false)
            .with_writer(std::io::stdout);
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,22 +1,23 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
+use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
 use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Instant;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
+fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
-    let mut layer_map = LayerMap::<LayerDescriptor>::default();
+    let mut layer_map = LayerMap::default();
    let mut min_lsn = Lsn(u64::MAX);
    let mut max_lsn = Lsn(0);
@@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
-        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
+        updates.insert_historic(layer.layer_desc().clone());
    }
    println!("min: {min_lsn}, max: {max_lsn}");
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
 }
 /// Construct a layer map query pattern for benchmarks
-fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
+fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
    // For each image layer we query one of the pages contained, at LSN right
    // before the image layer was created. This gives us a somewhat uniform
    // coverage of both the lsn and key space because image layers have
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
 // Construct a partitioning for testing get_difficulty map when we
 // don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
+fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
    let mut parts = Vec::new();
    // We add a partition boundary at the start of each image layer,
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = LayerDescriptor {
+        let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
-            key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
+            TenantId::generate(),
-            lsn: Lsn(i)..Lsn(i + 1),
+            TimelineId::generate(),
-            is_incremental: false,
+            zero.add(10 * i32)..zero.add(10 * i32 + 1),
-            short_id: format!("Layer {}", i),
+            Lsn(i),
-        };
+            false,
-        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
+            0,
        ));
        updates.insert_historic(layer.layer_desc().clone());
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -495,50 +495,50 @@ fn start_pageserver(
                Ok(())
            },
        );
    }
-        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-            let background_jobs_barrier = background_jobs_barrier;
+        let background_jobs_barrier = background_jobs_barrier;
-            let metrics_ctx = RequestContext::todo_child(
+        let metrics_ctx = RequestContext::todo_child(
-                TaskKind::MetricsCollection,
+            TaskKind::MetricsCollection,
-                // This task itself shouldn't download anything.
+            // This task itself shouldn't download anything.
-                // The actual size calculation does need downloads, and
+            // The actual size calculation does need downloads, and
-                // creates a child context with the right DownloadBehavior.
+            // creates a child context with the right DownloadBehavior.
-                DownloadBehavior::Error,
+            DownloadBehavior::Error,
-            );
+        );
-            task_mgr::spawn(
+        task_mgr::spawn(
-                MGMT_REQUEST_RUNTIME.handle(),
+            crate::BACKGROUND_RUNTIME.handle(),
-                TaskKind::MetricsCollection,
+            TaskKind::MetricsCollection,
-                None,
+            None,
-                None,
+            None,
-                "consumption metrics collection",
+            "consumption metrics collection",
-                true,
+            true,
-                async move {
+            async move {
-                    // first wait until background jobs are cleared to launch.
+                // first wait until background jobs are cleared to launch.
-                    //
+                //
-                    // this is because we only process active tenants and timelines, and the
+                // this is because we only process active tenants and timelines, and the
-                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                    // which will not be rate-limited.
+                // which will not be rate-limited.
-                    let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token();
-                    tokio::select! {
+                tokio::select! {
-                        _ = cancel.cancelled() => { return Ok(()); },
+                    _ = cancel.cancelled() => { return Ok(()); },
-                        _ = background_jobs_barrier.wait() => {}
+                    _ = background_jobs_barrier.wait() => {}
-                    };
+                };
-                    pageserver::consumption_metrics::collect_metrics(
+                pageserver::consumption_metrics::collect_metrics(
-                        metric_collection_endpoint,
+                    metric_collection_endpoint,
-                        conf.metric_collection_interval,
+                    conf.metric_collection_interval,
-                        conf.cached_metric_collection_interval,
+                    conf.cached_metric_collection_interval,
-                        conf.synthetic_size_calculation_interval,
+                    conf.synthetic_size_calculation_interval,
-                        conf.id,
+                    conf.id,
-                        metrics_ctx,
+                    metrics_ctx,
-                    )
+                )
-                    .instrument(info_span!("metrics_collection"))
+                .instrument(info_span!("metrics_collection"))
-                    .await?;
+                .await?;
-                    Ok(())
+                Ok(())
-                },
+            },
-            );
+        );
        }
    }
    // Spawn a task to listen for libpq connections. It will spawn further tasks
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -96,12 +96,12 @@ pub mod defaults {
 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
-# [tenant_config]
+[tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
 #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
 #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
+#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}
@@ -111,7 +111,8 @@ pub mod defaults {
 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false
-# [remote_storage]
+
 [remote_storage]
 "###
    );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size";
 const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 #[serde_as]
 #[derive(Serialize, Debug)]
 struct Ids {
@@ -73,7 +75,10 @@ pub async fn collect_metrics(
    );
    // define client here to reuse it for all requests
-    let client = reqwest::Client::new();
+    let client = reqwest::ClientBuilder::new()
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
@@ -83,7 +88,7 @@ pub async fn collect_metrics(
                info!("collect_metrics received cancellation request");
                return Ok(());
            },
-            _ = ticker.tick() => {
+            tick_at = ticker.tick() => {
                // send cached metrics every cached_metric_collection_interval
                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
@@ -93,6 +98,12 @@ pub async fn collect_metrics(
                }
                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
                crate::tenant::tasks::warn_when_period_overrun(
                    tick_at.elapsed(),
                    metric_collection_interval,
                    "consumption_metrics_collect_metrics",
                );
            }
        }
    }
@@ -273,31 +284,42 @@ pub async fn collect_metrics_iteration(
        })
        .expect("PageserverConsumptionMetric should not fail serialization");
-        let res = client
+        const MAX_RETRIES: u32 = 3;
            .post(metric_collection_endpoint.clone())
            .json(&chunk_json)
            .send()
            .await;
-        match res {
+        for attempt in 0..MAX_RETRIES {
-            Ok(res) => {
+            let res = client
-                if res.status().is_success() {
+                .post(metric_collection_endpoint.clone())
-                    // update cached metrics after they were sent successfully
+                .json(&chunk_json)
-                    for (curr_key, curr_val) in chunk.iter() {
+                .send()
-                        cached_metrics.insert(curr_key.clone(), *curr_val);
+                .await;
-                    }
+
-                } else {
+            match res {
-                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                Ok(res) => {
-                    for metric in chunk_to_send.iter() {
+                    if res.status().is_success() {
-                        // Report if the metric value is suspiciously large
+                        // update cached metrics after they were sent successfully
-                        if metric.value > (1u64 << 40) {
+                        for (curr_key, curr_val) in chunk.iter() {
                            cached_metrics.insert(curr_key.clone(), *curr_val);
                        }
                    } else {
                        error!("metrics endpoint refused the sent metrics: {:?}", res);
                        for metric in chunk_to_send
                            .iter()
                            .filter(|metric| metric.value > (1u64 << 40))
                        {
                            // Report if the metric value is suspiciously large
                            error!("potentially abnormal metric value: {:?}", metric);
                        }
                    }
                    break;
                }
                Err(err) if err.is_timeout() => {
                    error!(attempt, "timeout sending metrics, retrying immediately");
                    continue;
                }
                Err(err) => {
                    error!(attempt, ?err, "failed to send metrics");
                    break;
                }
            }
            Err(err) => {
                error!("failed to send metrics: {:?}", err);
            }
        }
    }
@@ -317,7 +339,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        _ = ticker.tick() => {
+        tick_at = ticker.tick() => {
                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -343,6 +365,12 @@ pub async fn calculate_synthetic_size_worker(
                    }
                }
                crate::tenant::tasks::warn_when_period_overrun(
                    tick_at.elapsed(),
                    synthetic_size_calculation_interval,
                    "consumption_metrics_synthetic_size_worker",
                );
            }
        }
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task(
            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                .await;
            info!("disk usage based eviction task finishing");
            Ok(())
        },
    );
@@ -126,13 +125,16 @@ async fn disk_usage_eviction_task(
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
        info!("disk usage based eviction task finishing");
    };
    use crate::tenant::tasks::random_init_delay;
    {
        if random_init_delay(task_config.period, &cancel)
            .await
            .is_err()
        {
            info!("shutting down");
            return;
        }
    }
@@ -167,7 +169,6 @@ async fn disk_usage_eviction_task(
        tokio::select! {
            _ = tokio::time::sleep_until(sleep_until) => {},
            _ = cancel.cancelled() => {
                info!("shutting down");
                break
            }
        }
@@ -314,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            partition,
            candidate.layer.get_tenant_id(),
            candidate.layer.get_timeline_id(),
-            candidate.layer.filename().file_name(),
+            candidate.layer,
        );
    }
@@ -516,7 +517,7 @@ async fn collect_eviction_candidates(
            if !tl.is_active() {
                continue;
            }
-            let info = tl.get_local_layers_for_disk_usage_eviction();
+            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -186,10 +186,8 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
    delete:
-      description: "Attempts to delete specified timeline. On 500 errors should be retried"
+      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
        "200":
          description: Ok
        "400":
          description: Error when no tenant id found in path or no timeline id
          content:
@@ -214,8 +212,14 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
          description: Deletion is already in progress, continue polling
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
        "412":
-          description: Tenant is missing
+          description: Tenant is missing, or timeline has children
          content:
            application/json:
              schema:
@@ -386,6 +390,7 @@ paths:
        "202":
          description: Tenant attaching scheduled
        "400":
          description: Bad Request
          content:
            application/json:
              schema:
@@ -717,6 +722,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ForbiddenError"
        "406":
          description: Permanently unsatisfiable request, don't retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "409":
          description: Timeline already exists, creation skipped
          content:
@@ -945,7 +956,7 @@ components:
              type: string
              enum: [ "maybe", "attached", "failed" ]
            data:
-            - type: object
+              type: object
              properties:
                reason:
                  type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -23,7 +23,6 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::disk_usage_eviction_task;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -35,6 +34,7 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
    auth::JwtAuth,
    http::{
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
 impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            e @ GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
            SetNewTenantConfigError::GetTenant(tid) => {
-                ApiError::NotFound(anyhow!("tenant {}", tid))
+                ApiError::NotFound(anyhow!("tenant {}", tid).into())
            }
            e @ SetNewTenantConfigError::Persist(_) => {
                ApiError::InternalServerError(anyhow::Error::new(e))
@@ -182,10 +182,12 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
        match value {
-            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
-            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
+            HasChildren(children) => ApiError::PreconditionFailed(
-                "Cannot delete timeline which has child timelines"
+                format!("Cannot delete timeline which has child timelines: {children:?}")
-            )),
+                    .into_boxed_str(),
            ),
            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -197,9 +199,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish the
            // "tenant is missing" case from "timeline is missing"
-            Tenant(GetTenantError::NotFound(..)) => {
+            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
-                ApiError::PreconditionFailed("Requested tenant is missing")
+                "Requested tenant is missing".to_owned().into_boxed_str(),
-            }
+            ),
            Tenant(t) => ApiError::from(t),
            Timeline(t) => ApiError::from(t),
        }
@@ -214,7 +216,7 @@ async fn build_timeline_info(
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
-    let mut info = build_timeline_info_common(timeline, ctx)?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -232,7 +234,7 @@ async fn build_timeline_info(
    Ok(info)
 }
-fn build_timeline_info_common(
+async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
@@ -263,7 +265,7 @@ fn build_timeline_info_common(
            None
        }
    };
-    let current_physical_size = Some(timeline.layer_size_sum());
+    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -326,14 +328,22 @@ async fn timeline_create_handler(
            &ctx,
        )
        .await {
-            Ok(Some(new_timeline)) => {
+            Ok(new_timeline) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(tenant::CreateTimelineError::AlreadyExists) => {
-            Err(err) => Err(ApiError::InternalServerError(err)),
+                json_response(StatusCode::CONFLICT, ())
            }
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
                json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
                    format!("{err:#}")
                ))
            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
@@ -395,7 +405,7 @@ async fn timeline_detail_handler(
        let timeline = tenant
            .get_timeline(timeline_id, false)
-            .map_err(ApiError::NotFound)?;
+            .map_err(|e| ApiError::NotFound(e.into()))?;
        let timeline_info = build_timeline_info(
            &timeline,
@@ -494,7 +504,8 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;
-    json_response(StatusCode::OK, ())
+    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
    json_response(StatusCode::ACCEPTED, ())
 }
 async fn tenant_detach_handler(
@@ -589,7 +600,7 @@ async fn tenant_status(
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum();
+            current_physical_size += timeline.layer_size_sum().await;
        }
        let state = tenant.current_state();
@@ -699,7 +710,7 @@ async fn layer_map_info_handler(
    check_permission(&request, Some(tenant_id))?;
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let layer_map_info = timeline.layer_map_info(reset);
+    let layer_map_info = timeline.layer_map_info(reset).await;
    json_response(StatusCode::OK, layer_map_info)
 }
@@ -1058,7 +1069,7 @@ async fn timeline_download_remote_layers_handler_get(
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    json_response(StatusCode::OK, info)
 }
@@ -1069,7 +1080,7 @@ async fn active_timeline_of_active_tenant(
    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)
+        .map_err(|e| ApiError::NotFound(e.into()))
 }
 async fn always_panic_handler(
@@ -1125,8 +1136,6 @@ async fn disk_usage_eviction_run(
        freed_bytes: 0,
    };
    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
    let (tx, rx) = tokio::sync::oneshot::channel();
    let state = get_state(&r);
@@ -1144,7 +1153,7 @@ async fn disk_usage_eviction_run(
    let _g = cancel.drop_guard();
    crate::task_mgr::spawn(
-        MGMT_REQUEST_RUNTIME.handle(),
+        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush()?;
+            modification.flush().await?;
        }
    }
    // We're done importing all the data files.
-    modification.commit()?;
+    modification.commit().await?;
    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
    // because there is no guarantee about the order in which we are processing segments.
    // ignore "relation already exists" error
    //
-    // FIXME: use proper error type for this, instead of parsing the error message.
+    // FIXME: Keep track of which relations we've already created?
    // Or better yet, keep track of which relations we've already created
    // https://github.com/neondatabase/neon/issues/3309
    if let Err(e) = modification
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        if e.to_string().contains("already exists") {
+        match e {
-            debug!("relation {} already exists. we must be extending it", rel);
+            RelationError::AlreadyExists => {
-        } else {
+                debug!("Relation {} already exist. We must be extending it.", rel)
-            return Err(e);
+            }
            _ => return Err(e.into()),
        }
    }
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush()?;
+                modification.flush().await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;
-    modification.commit()?;
+    modification.commit().await?;
    Ok(())
 }
@@ -594,7 +594,7 @@ async fn import_file(
        // zenith.signal is not necessarily the last file that we handle,
        // but it is ok to call `finish_write()`, because the final `modification.commit()`
        // will update the lsn once more to the final one.
-        let writer = modification.tline.writer();
+        let writer = modification.tline.writer().await;
        writer.finish_write(prev_lsn);
        debug!("imported zenith signal {}", prev_lsn);
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
-use metrics::core::{AtomicU64, GenericCounter};
+use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    UIntGauge, UIntGaugeVec,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
@@ -95,21 +95,19 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
 });
 // Metrics collected on operations on the storage repository.
-static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram_vec!(
+    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
-        "Time spent in reconstruct_value",
+        "Time spent in reconstruct_value (reconstruct a page from deltas)",
        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
-static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter_vec!(
+    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -124,15 +122,130 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });
-static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter_vec!(
+    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
 /// Counters for read traffic on the page cache, one field per label
 /// combination that `PAGE_CACHE` resolves up front.
 pub struct PageCacheMetrics {
    /// Read accesses with key kind `materialized_page`.
    pub read_accesses_materialized_page: IntCounter,
    /// Read accesses with key kind `ephemeral`.
    pub read_accesses_ephemeral: IntCounter,
    /// Read accesses with key kind `immutable`.
    pub read_accesses_immutable: IntCounter,
    /// Read hits with key kind `ephemeral` (hit kind `-`).
    pub read_hits_ephemeral: IntCounter,
    /// Read hits with key kind `immutable` (hit kind `-`).
    pub read_hits_immutable: IntCounter,
    /// Read hits with key kind `materialized_page` and hit kind `exact`.
    pub read_hits_materialized_page_exact: IntCounter,
    /// Read hits with key kind `materialized_page` and hit kind `older_lsn`.
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }
 /// Counter family for page cache read hits, labelled by `key_kind` and `hit_kind`.
 ///
 /// Registration happens lazily on first access and panics if it fails.
 static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_hits_total",
        "Number of read accesses to the page cache that hit",
        &["key_kind", "hit_kind"]
    )
    .expect("failed to define a metric")
 });
 /// Counter family for all page cache read accesses, labelled by `key_kind`.
 ///
 /// Registration happens lazily on first access and panics if it fails.
 static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_accesses_total",
        "Number of read accesses to the page cache",
        &["key_kind"]
    )
    .expect("failed to define a metric")
 });
 /// Page cache read counters with their label values resolved once, so that
 /// callers can increment them without a label lookup.
 ///
 /// Each counter is fetched from `PAGE_CACHE_READ_ACCESSES` or
 /// `PAGE_CACHE_READ_HITS`; a failed lookup panics during initialization.
 pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| {
    // Access counters carry a single `key_kind` label.
    let read_accesses = |key_kind: &str| {
        PAGE_CACHE_READ_ACCESSES
            .get_metric_with_label_values(&[key_kind])
            .unwrap()
    };
    // Hit counters carry `key_kind` and `hit_kind`; "-" is used where no
    // hit kind is distinguished.
    let read_hits = |key_kind: &str, hit_kind: &str| {
        PAGE_CACHE_READ_HITS
            .get_metric_with_label_values(&[key_kind, hit_kind])
            .unwrap()
    };

    PageCacheMetrics {
        read_accesses_materialized_page: read_accesses("materialized_page"),
        read_accesses_ephemeral: read_accesses("ephemeral"),
        read_accesses_immutable: read_accesses("immutable"),
        read_hits_ephemeral: read_hits("ephemeral", "-"),
        read_hits_immutable: read_hits("immutable", "-"),
        read_hits_materialized_page_exact: read_hits("materialized_page", "exact"),
        read_hits_materialized_page_older_lsn: read_hits("materialized_page", "older_lsn"),
    }
 });
 /// Gauges describing the size of the page cache, in bytes.
 pub struct PageCacheSizeMetrics {
    /// Maximum size of the page cache in bytes.
    pub max_bytes: UIntGauge,
    /// Current bytes held with key kind `ephemeral`.
    pub current_bytes_ephemeral: UIntGauge,
    /// Current bytes held with key kind `immutable`.
    pub current_bytes_immutable: UIntGauge,
    /// Current bytes held with key kind `materialized_page`.
    pub current_bytes_materialized_page: UIntGauge,
 }
 /// Gauge family for the current page cache size in bytes, labelled by `key_kind`.
 ///
 /// Registration happens lazily on first access and panics if it fails.
 static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_page_cache_size_current_bytes",
        "Current size of the page cache in bytes, by key kind",
        &["key_kind"]
    )
    .expect("failed to define a metric")
 });
 /// Page cache size gauges: the maximum size plus one current-size gauge per
 /// key kind, all resolved once on first access.
 ///
 /// Initialization panics if the max-bytes gauge cannot be registered or a
 /// label lookup on `PAGE_CACHE_SIZE_CURRENT_BYTES` fails.
 pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| {
    // Resolves the current-size gauge for one `key_kind` label value.
    let current_bytes_of = |key_kind: &str| {
        PAGE_CACHE_SIZE_CURRENT_BYTES
            .get_metric_with_label_values(&[key_kind])
            .unwrap()
    };

    let max_bytes = register_uint_gauge!(
        "pageserver_page_cache_size_max_bytes",
        "Maximum size of the page cache in bytes"
    )
    .expect("failed to define a metric");

    PageCacheSizeMetrics {
        max_bytes,
        current_bytes_ephemeral: current_bytes_of("ephemeral"),
        current_bytes_immutable: current_bytes_of("immutable"),
        current_bytes_materialized_page: current_bytes_of("materialized_page"),
    }
 });
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
@@ -207,11 +320,11 @@ pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
 pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
-        "pageserver_tenant_synthetic_size",
+        "pageserver_tenant_synthetic_cached_size_bytes",
-        "Synthetic size of each tenant",
+        "Synthetic size of each tenant in bytes",
        &["tenant_id"]
    )
-    .expect("Failed to register pageserver_tenant_synthetic_size metric")
+    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });
 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
@@ -428,6 +541,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });
 /// Newtype over the basebackup duration histogram family; its
 /// `DurationResultObserver` impl records durations under an "ok"/"error" label.
 pub struct BasebackupQueryTime(HistogramVec);
 /// Histogram of basebackup query durations in seconds, labelled by `result`
 /// and bucketed with `CRITICAL_OP_BUCKETS`.
 ///
 /// Registration happens lazily on first access and panics if it fails.
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
            CRITICAL_OP_BUCKETS.into(),
        )
        .expect("failed to define a metric")
    })
 });
 impl DurationResultObserver for BasebackupQueryTime {
    /// Records `duration` (in seconds) in the histogram labelled "ok" when
    /// `res` is `Ok`, and "error" otherwise.
    ///
    /// Panics if the label lookup on the underlying histogram family fails.
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
        let label_value = match res {
            Ok(_) => "ok",
            Err(_) => "error",
        };
        self.0
            .get_metric_with_label_values(&[label_value])
            .unwrap()
            .observe(duration.as_secs_f64());
    }
 }
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
@@ -752,10 +886,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
    pub reconstruct_time_histo: Histogram,
    pub get_reconstruct_data_time_histo: Histogram,
    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
    pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -783,15 +914,9 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let reconstruct_time_histo = RECONSTRUCT_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -833,19 +958,13 @@ impl TimelineMetrics {
        let read_num_fs_layers = READ_NUM_FS_LAYERS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
        TimelineMetrics {
            tenant_id,
            timeline_id,
            reconstruct_time_histo,
            get_reconstruct_data_time_histo,
            materialized_page_cache_hit_counter,
            materialized_page_cache_hit_upon_request_counter,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -872,10 +991,7 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -968,7 +1084,6 @@ impl RemoteTimelineClientMetrics {
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
        let metric = guard.entry(key).or_insert_with(move || {
@@ -990,7 +1105,6 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntGauge {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1011,7 +1125,6 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1032,7 +1145,6 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_started_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1053,7 +1165,6 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_finished_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1319,4 +1430,8 @@ pub fn preinitialize_metrics() {
    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
    // Python tests need these.
    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,8 +53,8 @@ use utils::{
    lsn::Lsn,
 };
 use crate::repository::Key;
 use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,6 +187,8 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
    size_metrics: &'static PageCacheSizeMetrics,
 }
 ///
@@ -313,6 +315,10 @@ impl PageCache {
        key: &Key,
        lsn: Lsn,
    ) -> Option<(Lsn, PageReadGuard)> {
        crate::metrics::PAGE_CACHE
            .read_accesses_materialized_page
            .inc();
        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
                tenant_id,
@@ -323,8 +329,21 @@ impl PageCache {
        };
        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
+            if let CacheKey::MaterializedPage {
-                Some((lsn, guard))
+                hash_key: _,
                lsn: available_lsn,
            } = cache_key
            {
                if available_lsn == lsn {
                    crate::metrics::PAGE_CACHE
                        .read_hits_materialized_page_exact
                        .inc();
                } else {
                    crate::metrics::PAGE_CACHE
                        .read_hits_materialized_page_older_lsn
                        .inc();
                }
                Some((available_lsn, guard))
            } else {
                panic!("unexpected key type in slot");
            }
@@ -499,11 +518,31 @@ impl PageCache {
    /// ```
    ///
    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
            CacheKey::EphemeralPage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
            ),
        };
        read_access.inc();
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
                if is_first_iteration {
                    hit.inc();
                }
                return Ok(ReadBufResult::Found(read_guard));
            }
            is_first_iteration = false;
            // Not found. Find a victim buffer
            let (slot_idx, mut inner) =
@@ -681,6 +720,9 @@ impl PageCache {
                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
                        versions.remove(version_idx);
                        self.size_metrics
                            .current_bytes_materialized_page
                            .sub_page_sz(1);
                        if versions.is_empty() {
                            old_entry.remove_entry();
                        }
@@ -693,11 +735,13 @@ impl PageCache {
                let mut map = self.ephemeral_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
            }
        }
    }
@@ -725,6 +769,9 @@ impl PageCache {
                                slot_idx,
                            },
                        );
                        self.size_metrics
                            .current_bytes_materialized_page
                            .add_page_sz(1);
                        None
                    }
                }
@@ -735,6 +782,7 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
                        None
                    }
                }
@@ -745,6 +793,7 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
                        None
                    }
                }
@@ -844,6 +893,12 @@ impl PageCache {
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
@@ -866,6 +921,30 @@ impl PageCache {
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
        }
    }
 }
 /// Operations on a metric that take their argument as a `count` rather than
 /// a raw value. The implementation for `metrics::UIntGauge` in this file
 /// multiplies `count` by `PAGE_SZ` before applying it.
 trait PageSzBytesMetric {
    /// Set the metric from `count`.
    fn set_page_sz(&self, count: usize);
    /// Increase the metric by `count`.
    fn add_page_sz(&self, count: usize);
    /// Decrease the metric by `count`.
    fn sub_page_sz(&self, count: usize);
 }
 /// Returns `count * PAGE_SZ` as a `u64`.
 ///
 /// Panics if `count` or `PAGE_SZ` cannot be represented as a `u64`.
 #[inline(always)]
 fn count_times_page_sz(count: usize) -> u64 {
    let pages = u64::try_from(count).unwrap();
    let bytes_per_page = u64::try_from(PAGE_SZ).unwrap();
    pages * bytes_per_page
 }
 impl PageSzBytesMetric for metrics::UIntGauge {
    /// Sets the gauge to the byte size of `count` pages.
    fn set_page_sz(&self, count: usize) {
        let bytes = count_times_page_sz(count);
        self.set(bytes);
    }

    /// Adds the byte size of `count` pages to the gauge.
    fn add_page_sz(&self, count: usize) {
        let bytes = count_times_page_sz(count);
        self.add(bytes);
    }

    /// Subtracts the byte size of `count` pages from the gauge.
    fn sub_page_sz(&self, count: usize) {
        let bytes = count_times_page_sz(count);
        self.sub(bytes);
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -390,7 +390,9 @@ impl PageServerHandler {
        };
        // Check that the timeline exists
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
            .get_timeline(timeline_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -902,7 +904,7 @@ where
            self.check_permission(Some(tenant_id))?;
-            let lsn = if params.len() == 3 {
+            let lsn = if params.len() >= 3 {
                Some(
                    Lsn::from_str(params[2])
                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
@@ -911,10 +913,24 @@ where
                None
            };
-            // Check that the timeline exists
+            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
+                &*crate::metrics::BASEBACKUP_QUERY_TIME,
-                .await?;
+                async move {
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                    self.handle_basebackup_request(
                        pgb,
                        tenant_id,
                        timeline_id,
                        lsn,
                        None,
                        false,
                        ctx,
                    )
                    .await?;
                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
                    anyhow::Ok(())
                },
            )
            .await?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1230,6 +1246,6 @@ async fn get_active_tenant_timeline(
        .map_err(GetActiveTimelineError::Tenant)?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(GetActiveTimelineError::Timeline)?;
+        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
    Ok(timeline)
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,6 +43,16 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }
 /// Errors reported by relation-level operations, letting callers match on
 /// the failure kind instead of inspecting an error message.
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
    /// The relation is already present in the relation directory.
    #[error("Relation Already Exists")]
    AlreadyExists,
    /// The relation tag has `relnode == 0`.
    #[error("invalid relnode")]
    InvalidRelnode,
    /// Any other failure; displayed transparently as the wrapped `anyhow::Error`.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -101,9 +111,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -148,9 +158,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -193,9 +203,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        // first try to lookup relation in cache
@@ -699,6 +709,20 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }
    /// Test-only setup: runs `init_empty()` and then stores placeholder
    /// control-file and checkpoint contents.
    ///
    /// Returns the first error from any of the three steps; the two `put_*`
    /// failures are annotated with context naming the failing step.
    #[cfg(test)]
    pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
        self.init_empty()?;

        let control_file = bytes::Bytes::from_static(b"control_file contents do not matter");
        self.put_control_file(control_file)
            .context("put_control_file")?;

        let checkpoint = bytes::Bytes::from_static(b"checkpoint_file contents do not matter");
        self.put_checkpoint(checkpoint)
            .context("put_checkpoint_file")?;

        Ok(())
    }
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -710,7 +734,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        rec: NeonWalRecord,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -737,7 +761,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -861,32 +885,38 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), RelationError> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        if rel.relnode == 0 {
            return Err(RelationError::InvalidRelnode);
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
            .context("deserialize db")?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
            // Didn't exist. Update dbdir
            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
-            let buf = DbDirectory::ser(&dbdir)?;
+            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
            // and create the RelDirectory
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
                .context("deserialize db")?
        };
        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            anyhow::bail!("rel {rel} already exists");
+            return Err(RelationError::AlreadyExists);
        }
        self.put(
            rel_dir_key,
-            Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
+            Value::Image(Bytes::from(
                RelDirectory::ser(&rel_dir).context("serialize")?,
            )),
        );
        // Put size
@@ -911,7 +941,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        let last_lsn = self.tline.get_last_record_lsn();
        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
@@ -942,7 +972,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        // Put size
        let size_key = rel_size_to_key(rel);
@@ -963,7 +993,7 @@ impl<'a> DatadirModification<'a> {
    /// Drop a relation.
    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1108,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1116,19 +1146,20 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }
-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;
        // Flush relation and SLRU data blocks, keep metadata.
-        let mut result: anyhow::Result<()> = Ok(());
+        let mut retained_pending_updates = HashMap::new();
-        self.pending_updates.retain(|&key, value| {
+        for (key, value) in self.pending_updates.drain() {
-            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
-                result = writer.put(key, self.lsn, value);
+                // This bails out on first error without modifying pending_updates.
-                false
+                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value).await?;
            } else {
-                true
+                retained_pending_updates.insert(key, value);
            }
-        });
+        }
-        result?;
+        self.pending_updates.extend(retained_pending_updates);
        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1143,17 +1174,17 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub fn commit(&mut self) -> anyhow::Result<()> {
+    pub async fn commit(&mut self) -> anyhow::Result<()> {
-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;
        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value)?;
+            writer.put(key, lsn, &value).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn)?;
+            writer.delete(key_range, lsn).await?;
        }
        writer.finish_write(lsn);
@@ -1593,20 +1624,6 @@ fn is_slru_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff // and not SlruSegSize
 }
 #[cfg(test)]
 pub fn create_test_timeline(
    tenant: &crate::tenant::Tenant,
    timeline_id: utils::id::TimelineId,
    pg_version: u32,
    ctx: &RequestContext,
 ) -> anyhow::Result<std::sync::Arc<Timeline>> {
    let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
    let mut m = tline.begin_modification(Lsn(8));
    m.init_empty()?;
    m.commit()?;
    Ok(tline)
 }
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,6 +257,9 @@ pub enum TaskKind {
    // task that handles attaching a tenant
    Attach,
    // Used mostly for background deletion from s3
    TimelineDeletionWorker,
    // task that handles metrics collection
    MetricsCollection,
@@ -503,17 +506,17 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            let completed = tokio::select! {
+            let join_handle = tokio::select! {
                biased;
-                _ = &mut join_handle => { true },
+                _ = &mut join_handle => { None },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
                    // lines.
                    info!("waiting for {} to shut down", task.name);
-                    false
+                    Some(join_handle)
                }
            };
-            if !completed {
+            if let Some(join_handle) = join_handle {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -38,8 +38,8 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
 use anyhow::Context;
 use anyhow::Result;
 use std::collections::HashMap;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
-pub use historic_layer_coverage::Replacement;
+pub use historic_layer_coverage::LayerKey;
 use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;
 use super::storage_layer::PersistentLayerKey;
 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
-pub struct LayerMap<L: ?Sized> {
+#[derive(Default)]
 pub struct LayerMap {
    //
    // 'open_layer' holds the current InMemoryLayer that is accepting new
    // records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -95,24 +93,6 @@ pub struct LayerMap<L: ?Sized> {
    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
    /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
    /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
    /// RemoteLayer will be removed.
    mapping: HashMap<PersistentLayerKey, Arc<L>>,
 }
 impl<L: ?Sized> Default for LayerMap<L> {
    fn default() -> Self {
        Self {
            open_layer: None,
            next_open_layer_at: None,
            frozen_layers: VecDeque::default(),
            l0_delta_layers: Vec::default(),
            historic: BufferedHistoricLayerCoverage::default(),
            mapping: HashMap::default(),
        }
    }
 }
 /// The primary update API for the layer map.
@@ -120,24 +100,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
 /// Batching historic layer insertions and removals is good for
 /// performance and this struct helps us do that correctly.
 #[must_use]
-pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
+pub struct BatchedUpdates<'a> {
    // While we hold this exclusive reference to the layer map the type checker
    // will prevent us from accidentally reading any unflushed updates.
-    layer_map: &'a mut LayerMap<L>,
+    layer_map: &'a mut LayerMap,
 }
 /// Provide ability to batch more updates while hiding the read
 /// API so we don't accidentally read without flushing.
-impl<L> BatchedUpdates<'_, L>
+impl BatchedUpdates<'_> {
 where
    L: ?Sized + Layer,
 {
    ///
    /// Insert an on-disk layer.
    ///
    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
-    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
-        self.layer_map.insert_historic_noflush(layer_desc, layer)
+        self.layer_map.insert_historic_noflush(layer_desc)
    }
    ///
@@ -145,31 +122,8 @@ where
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
-        self.layer_map.remove_historic_noflush(layer_desc, layer)
+        self.layer_map.remove_historic_noflush(layer_desc)
    }
    /// Replaces existing layer iff it is the `expected`.
    ///
    /// If the expected layer has been removed it will not be inserted by this function.
    ///
    /// Returned `Replacement` describes succeeding in replacement or the reason why it could not
    /// be done.
    ///
    /// TODO replacement can be done without buffering and rebuilding layer map updates.
    ///      One way to do that is to add a layer of indirection for returned values, so
    ///      that we can replace values only by updating a hashmap.
    pub fn replace_historic(
        &mut self,
        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
        self.layer_map
            .replace_historic_noflush(expected_desc, expected, new_desc, new)
    }
    // We will flush on drop anyway, but this method makes it
@@ -185,25 +139,19 @@ where
 // than panic later or read without flushing.
 //
 // TODO maybe warn if flush hasn't explicitly been called
-impl<L> Drop for BatchedUpdates<'_, L>
+impl Drop for BatchedUpdates<'_> {
 where
    L: ?Sized + Layer,
 {
    fn drop(&mut self) {
        self.layer_map.flush_updates();
    }
 }
 /// Return value of LayerMap::search
-pub struct SearchResult<L: ?Sized> {
+pub struct SearchResult {
-    pub layer: Arc<L>,
+    pub layer: Arc<PersistentLayerDesc>,
    pub lsn_floor: Lsn,
 }
-impl<L> LayerMap<L>
+impl LayerMap {
 where
    L: ?Sized + Layer,
 {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
    /// 'key', with lsn.start < 'end_lsn'.
@@ -235,7 +183,7 @@ where
    /// NOTE: This only searches the 'historic' layers, *not* the
    /// 'open' and 'frozen' layers!
    ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
        let latest_delta = version.delta_coverage.query(key.to_i128());
        let latest_image = version.image_coverage.query(key.to_i128());
@@ -244,7 +192,6 @@ where
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
                let image = self.get_layer_from_mapping(&image.key()).clone();
                Some(SearchResult {
                    layer: image,
                    lsn_floor,
@@ -252,7 +199,6 @@ where
            }
            (Some(delta), None) => {
                let lsn_floor = delta.get_lsn_range().start;
                let delta = self.get_layer_from_mapping(&delta.key()).clone();
                Some(SearchResult {
                    layer: delta,
                    lsn_floor,
@@ -263,7 +209,6 @@ where
                let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                let image_exact_match = img_lsn + 1 == end_lsn;
                if image_is_newer || image_exact_match {
                    let image = self.get_layer_from_mapping(&image.key()).clone();
                    Some(SearchResult {
                        layer: image,
                        lsn_floor: img_lsn,
@@ -271,7 +216,6 @@ where
                } else {
                    let lsn_floor =
                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
                    let delta = self.get_layer_from_mapping(&delta.key()).clone();
                    Some(SearchResult {
                        layer: delta,
                        lsn_floor,
@@ -282,7 +226,7 @@ where
    }
    /// Start a batch of updates, applied on drop
-    pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
+    pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
        BatchedUpdates { layer_map: self }
    }
@@ -292,48 +236,32 @@ where
    /// Helper function for BatchedUpdates::insert_historic
    ///
    /// TODO(chi): remove L generic so that we do not need to pass layer object.
-    pub(self) fn insert_historic_noflush(
+    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        &mut self,
        layer_desc: PersistentLayerDesc,
        layer: Arc<L>,
    ) {
        self.mapping.insert(layer_desc.key(), layer.clone());
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
-        if Self::is_l0(&layer) {
+        if Self::is_l0(&layer_desc) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }
        self.historic.insert(
-            historic_layer_coverage::LayerKey::from(&*layer),
+            historic_layer_coverage::LayerKey::from(&layer_desc),
            layer_desc.into(),
        );
    }
    fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
        let layer = self
            .mapping
            .get(key)
            .with_context(|| format!("{key:?}"))
            .expect("inconsistent layer mapping");
        layer
    }
    ///
    /// Remove an on-disk layer from the map.
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(&*layer));
+            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
-        if Self::is_l0(&layer) {
+        let layer_key = layer_desc.key();
        if Self::is_l0(&layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
-            l0_delta_layers.retain(|other| {
+            l0_delta_layers.retain(|other| other.key() != layer_key);
                !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
            });
            self.l0_delta_layers = l0_delta_layers;
            // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
            // there's a chance that the comparison fails at runtime due to it comparing (pointer,
@@ -344,69 +272,6 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
        self.mapping.remove(&layer_desc.key());
    }
    pub(self) fn replace_historic_noflush(
        &mut self,
        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        let key = historic_layer_coverage::LayerKey::from(&**expected);
        let other = historic_layer_coverage::LayerKey::from(&*new);
        let expected_l0 = Self::is_l0(expected);
        let new_l0 = Self::is_l0(&new);
        anyhow::ensure!(
            key == other,
            "expected and new must have equal LayerKeys: {key:?} != {other:?}"
        );
        anyhow::ensure!(
            expected_l0 == new_l0,
            "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
        );
        let l0_index = if expected_l0 {
            // find the index in case replace worked, we need to replace that as well
            let pos = self.l0_delta_layers.iter().position(|slot| {
                Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
            });
            if pos.is_none() {
                return Ok(Replacement::NotFound);
            }
            pos
        } else {
            None
        };
        let new_desc = Arc::new(new_desc);
        let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
            **existing == expected_desc
        });
        if let Replacement::Replaced { .. } = &replaced {
            self.mapping.remove(&expected_desc.key());
            self.mapping.insert(new_desc.key(), new);
            if let Some(index) = l0_index {
                self.l0_delta_layers[index] = new_desc;
            }
        }
        let replaced = match replaced {
            Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
            Replacement::NotFound => Replacement::NotFound,
            Replacement::RemovalBuffered => Replacement::RemovalBuffered,
            Replacement::Unexpected(x) => {
                Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
            }
        };
        Ok(replaced)
    }
    /// Helper function for BatchedUpdates::drop.
@@ -454,10 +319,8 @@ where
        Ok(true)
    }
-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
+    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
-        self.historic
+        self.historic.iter()
            .iter()
            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
    }
    ///
@@ -472,7 +335,7 @@ where
        &self,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
+    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
        let version = match self.historic.get().unwrap().get_version(lsn.0) {
            Some(v) => v,
            None => return Ok(vec![]),
@@ -482,36 +345,26 @@ where
        let end = key_range.end.to_i128();
        // Initialize loop variables
-        let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
+        let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
        let mut current_key = start;
        let mut current_val = version.image_coverage.query(start);
        // Loop through the change events and push intervals
        for (change_key, change_val) in version.image_coverage.range(start..end) {
            let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
-            coverage.push((
+            coverage.push((kr, current_val.take()));
                kr,
                current_val
                    .take()
                    .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
            ));
            current_key = change_key;
            current_val = change_val.clone();
        }
        // Add the final interval
        let kr = Key::from_i128(current_key)..Key::from_i128(end);
-        coverage.push((
+        coverage.push((kr, current_val.take()));
            kr,
            current_val
                .take()
                .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
        ));
        Ok(coverage)
    }
-    pub fn is_l0(layer: &L) -> bool {
+    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
        range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
    }
@@ -537,7 +390,7 @@ where
    /// TODO The optimal number should probably be slightly higher than 1, but to
    ///      implement that we need to plumb a lot more context into this function
    ///      than just the current partition_range.
-    pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
+    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
        if !Self::is_l0(layer) {
            return true;
@@ -595,9 +448,7 @@ where
                    let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                    let lr = lsn.start..val.get_lsn_range().start;
                    if !kr.is_empty() {
-                        let base_count =
+                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
                            Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
                                as usize;
                        let new_limit = limit.map(|l| l - base_count);
                        let max_stacked_deltas_underneath =
                            self.count_deltas(&kr, &lr, new_limit)?;
@@ -620,9 +471,7 @@ where
                let lr = lsn.start..val.get_lsn_range().start;
                if !kr.is_empty() {
-                    let base_count =
+                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
                        Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
                            as usize;
                    let new_limit = limit.map(|l| l - base_count);
                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
                    max_stacked_deltas = std::cmp::max(
@@ -772,12 +621,8 @@ where
    }
    /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
-        Ok(self
+        Ok(self.l0_delta_layers.to_vec())
            .l0_delta_layers
            .iter()
            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
            .collect())
    }
    /// debugging function to print out the contents of the layer map
@@ -802,72 +647,51 @@ where
        println!("End dump LayerMap");
        Ok(())
    }
    /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
    ///
    /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
    #[inline(always)]
    pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
        // "dyn Trait" objects are "fat pointers" in that they have two components:
        // - pointer to the object
        // - pointer to the vtable
        //
        // rust does not provide a guarantee that these vtables are unique, but however
        // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
        // pointer and the vtable need to be equal.
        //
        // See: https://github.com/rust-lang/rust/issues/103763
        //
        // A future version of rust will most likely use this form below, where we cast each
        // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
        // not affect the comparison.
        //
        // See: https://github.com/rust-lang/rust/pull/106450
        let left = Arc::as_ptr(left) as *const ();
        let right = Arc::as_ptr(right) as *const ();
        left == right
    }
 }
 #[cfg(test)]
 mod tests {
-    use super::{LayerMap, Replacement};
+    use super::LayerMap;
-    use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
+    use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
    use std::str::FromStr;
    use std::sync::Arc;
    mod l0_delta_layers_updated {
        use crate::tenant::{
            storage_layer::{PersistentLayer, PersistentLayerDesc},
            timeline::LayerFileManager,
        };
        use super::*;
        #[test]
        fn for_full_range_delta() {
            // l0_delta_layers are used by compaction, and should observe all buffered updates
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
+                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                true
+                 true
-            )
+             )
        }
        #[test]
        fn for_non_full_range_delta() {
            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
+                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                // because not full range
+                 // because not full range
-                false
+                 false
-            )
+             )
        }
        #[test]
        fn for_image() {
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
+                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                // code only checks if it is a full range layer, doesn't care about images, which must
+                 // code only checks if it is a full range layer, doesn't care about images, which must
-                // mean we should in practice never have full range images
+                 // mean we should in practice never have full range images
-                false
+                 false
-            )
+             )
        }
        #[test]
@@ -883,16 +707,16 @@ mod tests {
            let not_found = Arc::new(layer.clone());
            let new_version = Arc::new(layer);
-            let mut map = LayerMap::default();
+            // after the immutable storage state refactor, the replace operation
            // will not use layer map any more. We keep it here for consistency in test cases
            // and can remove it in the future.
            let _map = LayerMap::default();
-            let res = map.batch_update().replace_historic(
+            let mut mapping = LayerFileManager::new();
                not_found.get_persistent_layer_desc(),
                &not_found,
                new_version.get_persistent_layer_desc(),
                new_version,
            );
-            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
+            mapping
                .replace_and_verify(not_found, new_version)
                .unwrap_err();
        }
        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
@@ -903,49 +727,44 @@ mod tests {
            let downloaded = Arc::new(skeleton);
            let mut map = LayerMap::default();
            let mut mapping = LayerFileManager::new();
            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
+            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
            let expected_in_counts = (1, usize::from(expected_l0));
            map.batch_update()
-                .insert_historic(remote.get_persistent_layer_desc(), remote.clone());
+                .insert_historic(remote.layer_desc().clone());
-            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
+            mapping.insert(remote.clone());
-
+            assert_eq!(
-            let replaced = map
+                count_layer_in(&map, remote.layer_desc()),
-                .batch_update()
+                expected_in_counts
-                .replace_historic(
+            );
-                    remote.get_persistent_layer_desc(),
+
-                    &remote,
+            mapping
-                    downloaded.get_persistent_layer_desc(),
+                .replace_and_verify(remote, downloaded.clone())
-                    downloaded.clone(),
+                .expect("name derived attributes are the same");
-                )
+            assert_eq!(
-                .expect("name derived attributes are the same");
+                count_layer_in(&map, downloaded.layer_desc()),
-            assert!(
+                expected_in_counts
                matches!(replaced, Replacement::Replaced { .. }),
                "{replaced:?}"
            );
            assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
            map.batch_update()
-                .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
+                .remove_historic(downloaded.layer_desc().clone());
-            assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
+            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }
-        fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
+        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
-                .filter(|x| LayerMap::compare_arced_layers(x, layer))
+                .filter(|x| x.key() == layer.key())
                .count();
            let l0s = map
                .get_level0_deltas()
                .expect("why does this return a result");
-            let l0 = l0s
+            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
                .iter()
                .filter(|x| LayerMap::compare_arced_layers(x, layer))
                .count();
            (historic, l0)
        }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -3,6 +3,8 @@ use std::ops::Range;
 use tracing::info;
 use crate::tenant::storage_layer::PersistentLayerDesc;
 use super::layer_coverage::LayerCoverageTuple;
 /// Layers in this module are identified and indexed by this data.
@@ -41,8 +43,8 @@ impl Ord for LayerKey {
    }
 }
-impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
+impl From<&PersistentLayerDesc> for LayerKey {
-    fn from(layer: &'a L) -> Self {
+    fn from(layer: &PersistentLayerDesc) -> Self {
        let kr = layer.get_key_range();
        let lr = layer.get_lsn_range();
        LayerKey {
@@ -454,59 +456,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        self.buffer.insert(layer_key, None);
    }
    /// Replaces a previous layer with a new layer value.
    ///
    /// The replacement is conditional on:
    /// - there is an existing `LayerKey` record
    /// - there is no buffered removal for the given `LayerKey`
    /// - the given closure returns true for the current `Value`
    ///
    /// The closure is used to compare the latest value (buffered insert, or existing layer)
    /// against some expectation. This allows to use `Arc::ptr_eq` or similar which would be
    /// inaccessible via `PartialEq` trait.
    ///
    /// Returns a `Replacement` value describing the outcome; only the case of
    /// `Replacement::Replaced` modifies the map and requires a rebuild.
    pub fn replace<F>(
        &mut self,
        layer_key: &LayerKey,
        new: Value,
        check_expected: F,
    ) -> Replacement<Value>
    where
        F: FnOnce(&Value) -> bool,
    {
        let (slot, in_buffered) = match self.buffer.get(layer_key) {
            Some(inner @ Some(_)) => {
                // we compare against the buffered version, because there will be a later
                // rebuild before querying
                (inner.as_ref(), true)
            }
            Some(None) => {
                // buffer has removal for this key; it will not be equivalent by any check_expected.
                return Replacement::RemovalBuffered;
            }
            None => {
                // no pending modification for the key, check layers
                (self.layers.get(layer_key), false)
            }
        };
        match slot {
            Some(existing) if !check_expected(existing) => {
                // unfortunate clone here, but otherwise the nll borrowck grows the region of
                // 'a to cover the whole function, and we could not mutate in the other
                // Some(existing) branch
                Replacement::Unexpected(existing.clone())
            }
            None => Replacement::NotFound,
            Some(_existing) => {
                self.insert(layer_key.to_owned(), new);
                Replacement::Replaced { in_buffered }
            }
        }
    }
    pub fn rebuild(&mut self) {
        // Find the first LSN that needs to be rebuilt
        let rebuild_since: u64 = match self.buffer.iter().next() {
@@ -575,22 +524,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    }
 }
 /// Outcome of the replace operation.
 ///
 /// Only [`Replacement::Replaced`] means the map was modified; every other variant reports
 /// why the new value was not stored.
 #[derive(Debug)]
 pub enum Replacement<Value> {
    /// Previous value was replaced with the new value.
    Replaced {
        /// Replacement happened for a scheduled insert, i.e. the value that was compared and
        /// replaced came from the buffered updates rather than from the existing layers.
        in_buffered: bool,
    },
    /// Key was not found in buffered updates or existing layers.
    NotFound,
    /// Key has been scheduled for removal, it was not replaced.
    RemovalBuffered,
    /// Previous value was rejected by the closure. Carries a clone of the rejected value.
    Unexpected(Value),
 }
 #[test]
 fn test_retroactive_regression_1() {
    let mut map = BufferedHistoricLayerCoverage::new();
@@ -699,139 +632,3 @@ fn test_retroactive_simple() {
        assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
    }
 }
 /// Replaces each layer with a "remote" stand-in and back again, checking after every
 /// rebuild that a query for key 4 observes the value that was put in place last.
 #[test]
 fn test_retroactive_replacement() {
    let mut map = BufferedHistoricLayerCoverage::new();
    // Three image layers at increasing LSNs; every one of them covers key 4.
    let cases = [
        (
            LayerKey {
                key: 0..5,
                lsn: 100..101,
                is_image: true,
            },
            "Image 1".to_string(),
        ),
        (
            LayerKey {
                key: 3..9,
                lsn: 110..111,
                is_image: true,
            },
            "Image 2".to_string(),
        ),
        (
            LayerKey {
                key: 4..6,
                lsn: 120..121,
                is_image: true,
            },
            "Image 3".to_string(),
        ),
    ];
    for (key, layer) in cases.iter() {
        map.insert(key.to_owned(), layer.to_owned());
    }
    // No rebuild needed before replacing: replace handles buffered updates as well as
    // existing layers.
    for (key, orig_layer) in cases.iter() {
        let remote = format!("Remote {orig_layer}");
        let at = key.lsn.end + 1;
        // evict: original -> remote
        let ret = map.replace(key, remote.clone(), |current| current == orig_layer);
        let replaced = matches!(ret, Replacement::Replaced { .. });
        assert!(replaced, "replace {orig_layer}: {ret:?}");
        map.rebuild();
        let version = map.get().expect("rebuilt").get_version(at).unwrap();
        let seen = version.image_coverage.query(4);
        assert_eq!(
            seen.as_deref(),
            Some(remote.as_str()),
            "query for 4 at version {at} after eviction",
        );
        // download: remote -> original
        let ret = map.replace(key, orig_layer.clone(), |current| current == &remote);
        let replaced = matches!(ret, Replacement::Replaced { .. });
        assert!(replaced, "replace {orig_layer} back: {ret:?}");
        map.rebuild();
        let version = map.get().expect("rebuilt").get_version(at).unwrap();
        let seen = version.image_coverage.query(4);
        assert_eq!(
            seen.as_deref(),
            Some(orig_layer.as_str()),
            "query for 4 at version {at} after download",
        );
    }
 }
 /// `replace` on a key that was never inserted must report `NotFound` and leave the map empty.
 #[test]
 fn missing_key_is_not_inserted_with_replace() {
    let key = LayerKey {
        key: 0..5,
        lsn: 100..101,
        is_image: true,
    };
    let mut map = BufferedHistoricLayerCoverage::new();
    // The closure accepts anything, yet there is nothing to compare against.
    let ret = map.replace(&key, "should not replace", |_| true);
    assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
    map.rebuild();
    let coverage = map.get().expect("no changes to rebuild");
    assert!(coverage.get_version(102).is_none());
 }
 /// Walks one key through insert -> replace -> remove and checks what `replace` reports at
 /// each stage, both before and after the buffered changes are rebuilt.
 #[test]
 fn replacing_buffered_insert_and_remove() {
    let key = LayerKey {
        key: 0..5,
        lsn: 100..101,
        is_image: true,
    };
    let mut map = BufferedHistoricLayerCoverage::new();
    map.insert(key.clone(), "Image 1");
    // The insert has not been rebuilt yet, so the replacement hits the buffered value.
    let ret = map.replace(&key, "Remote Image 1", |l| *l == "Image 1");
    let replaced_in_buffer = matches!(ret, Replacement::Replaced { in_buffered: true });
    assert!(replaced_in_buffer, "{ret:?}");
    map.rebuild();
    let version = map.get().expect("rebuilt").get_version(102).unwrap();
    assert_eq!(version.image_coverage.query(4), Some("Remote Image 1"));
    // A scheduled removal blocks replacement even though the closure accepts anything.
    map.remove(key.clone());
    let ret = map.replace(&key, "should not replace", |_| true);
    let removal_pending = matches!(ret, Replacement::RemovalBuffered);
    assert!(
        removal_pending,
        "cannot replace after scheduled remove: {ret:?}"
    );
    // Once the removal is applied the key is simply gone.
    map.rebuild();
    let ret = map.replace(&key, "should not replace", |_| true);
    let not_found = matches!(ret, Replacement::NotFound);
    assert!(not_found, "cannot replace after remove + rebuild: {ret:?}");
    let at_version = map.get().expect("rebuilt").get_version(102);
    assert!(at_version.is_none());
 }
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
 //! This module contains the encoding and decoding of the local manifest file.
 //!
 //! MANIFEST is a write-ahead log which is stored locally to each timeline. It
 //! records the state of the storage engine. It contains a snapshot of the
 //! state and all operations following that snapshot. The file begins with a
 //! header recording MANIFEST version number. After that, it contains a snapshot.
 //! The snapshot is followed by a list of operations. Each operation is a list
 //! of records. Each record is either an addition or a removal of a layer.
 //!
 //! With MANIFEST, we can:
 //!
 //! 1. recover state quickly by reading the file, potentially boosting the
 //!    startup speed.
 //! 2. ensure all operations are atomic and avoid corruption, solving issues
 //!    like redundant image layer and preparing us for future compaction
 //!    strategies.
 //!
 //! There is also a format for storing all layer files on S3, called
 //! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
 //! records all operations as logs, and therefore we can easily replay the
 //! operations when recovering from crash, while ensuring those operations
 //! are atomic upon restart.
 //!
 //! Currently, this is not used in the system. Future refactors will ensure
 //! the storage state will be recorded in this file, and the system can be
 //! recovered from this file. This is tracked in
 //! https://github.com/neondatabase/neon/issues/4418
 use std::io::{self, Read, Write};
 use crate::virtual_file::VirtualFile;
 use anyhow::Result;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use crc32c::crc32c;
 use serde::{Deserialize, Serialize};
 use tracing::log::warn;
 use utils::lsn::Lsn;
 use super::storage_layer::PersistentLayerDesc;
 /// Handle to a MANIFEST file.
 ///
 /// Created either by [`Manifest::init`] (fresh file) or [`Manifest::load`] (existing file);
 /// further operations are appended through [`Manifest::append_operation`].
 pub struct Manifest {
    /// The underlying file that the header and all records are written to.
    file: VirtualFile,
 }
 /// A snapshot of the state: the list of layers recorded at the time the snapshot was taken.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub struct Snapshot {
    /// Descriptions of the layers included in the snapshot.
    pub layers: Vec<PersistentLayerDesc>,
 }
 /// A single change within an operation: either the addition or the removal of a layer.
 ///
 /// By default serde encodes this as an externally tagged enum, so a serialized record looks
 /// something like `{ "AddLayer": { ... } }`.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub enum Record {
    /// The described layer was added.
    AddLayer(PersistentLayerDesc),
    /// The described layer was removed.
    RemoveLayer(PersistentLayerDesc),
 }
 /// Magic number written at the start of every manifest file.
 ///
 /// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
 const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
 /// Format version written into the manifest header; `Manifest::load` rejects any other value.
 const MANIFEST_VERSION: u64 = 1;
 /// Fixed-size header stored at the very beginning of the manifest file.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub struct ManifestHeader {
    /// Identifies the file as a manifest; `Manifest::init` writes `MANIFEST_MAGIC_NUMBER` here.
    magic_number: u64,
    /// Format version of the file; `Manifest::init` writes `MANIFEST_VERSION` here.
    version: u64,
 }
 /// Encoded size of [`ManifestHeader`] in bytes: two `u64` fields.
 const MANIFEST_HEADER_LEN: usize = 16;
 impl ManifestHeader {
    /// Serializes the header as `magic_number` followed by `version`, each written with
    /// `put_u64` (big-endian), for a total of `MANIFEST_HEADER_LEN` bytes.
    fn encode(&self) -> BytesMut {
        let Self {
            magic_number,
            version,
        } = self;
        let mut encoded = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
        encoded.put_u64(*magic_number);
        encoded.put_u64(*version);
        encoded
    }
    /// Parses a header from exactly `MANIFEST_HEADER_LEN` bytes.
    ///
    /// # Panics
    ///
    /// Panics if `buf` does not have exactly that length.
    fn decode(mut buf: &[u8]) -> Self {
        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
        // Each read advances `buf`, so the order must mirror `encode`.
        let magic_number = buf.get_u64();
        let version = buf.get_u64();
        Self {
            magic_number,
            version,
        }
    }
 }
 /// One entry of the manifest log.
 ///
 /// Each entry is serialized as JSON and stored as one checksummed record in the file.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub enum Operation {
    /// A snapshot of the current state.
    ///
    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
    Snapshot(Snapshot, Lsn),
    /// An atomic operation that changes the state.
    ///
    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
    /// This will only change when new L0 is flushed to the disk.
    Operation(Vec<Record>, Lsn),
 }
 /// Header that precedes the payload of every record in the manifest file.
 struct RecordHeader {
    /// Length of the payload in bytes.
    size: u32,
    /// crc32c checksum of the payload.
    checksum: u32,
 }
 /// Encoded size of [`RecordHeader`] in bytes: two `u32` fields.
 const RECORD_HEADER_LEN: usize = 8;
 impl RecordHeader {
    /// Serializes the header as `size` followed by `checksum`, each written with `put_u32`
    /// (big-endian), for a total of `RECORD_HEADER_LEN` bytes.
    fn encode(&self) -> BytesMut {
        let Self { size, checksum } = self;
        let mut encoded = BytesMut::with_capacity(RECORD_HEADER_LEN);
        encoded.put_u32(*size);
        encoded.put_u32(*checksum);
        encoded
    }
    /// Parses a header from exactly `RECORD_HEADER_LEN` bytes.
    ///
    /// # Panics
    ///
    /// Panics if `buf` does not have exactly that length.
    fn decode(mut buf: &[u8]) -> Self {
        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
        // Each read advances `buf`, so the order must mirror `encode`.
        let size = buf.get_u32();
        let checksum = buf.get_u32();
        Self { size, checksum }
    }
 }
 /// Errors that make a manifest file unusable when loading it.
 ///
 /// A truncated or checksum-failing record is not reported here; `Manifest::load` signals it
 /// through `ManifestPartiallyCorrupted` instead.
 #[derive(Debug, thiserror::Error)]
 pub enum ManifestLoadError {
    /// The manifest header could not be accepted, e.g. the file is shorter than
    /// `MANIFEST_HEADER_LEN`.
    #[error("manifest header is corrupted")]
    CorruptedManifestHeader,
    /// The header carries a version other than `MANIFEST_VERSION`. Fields are `(got, expected)`.
    #[error("unsupported manifest version: got {0}, expected {1}")]
    UnsupportedVersion(u64, u64),
    /// A record passed its checksum but could not be deserialized from JSON.
    #[error("error when decoding record: {0}")]
    DecodeRecord(serde_json::Error),
    /// Reading the file failed.
    #[error("I/O error: {0}")]
    Io(io::Error),
 }
 /// Flag returned by `Manifest::load`: `true` when reading the records stopped early because
 /// of an incomplete record header, incomplete record data, or a checksum mismatch.
 #[must_use = "Should check if the manifest is partially corrupted"]
 pub struct ManifestPartiallyCorrupted(bool);
 impl Manifest {
    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
    ///
    /// # Errors
    ///
    /// Returns an error if serializing the snapshot or writing/syncing the file fails.
    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
        let mut manifest = Self { file };
        manifest.append_manifest_header(ManifestHeader {
            magic_number: MANIFEST_MAGIC_NUMBER,
            version: MANIFEST_VERSION,
        })?;
        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
        Ok(manifest)
    }
    /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
    /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
    /// backup the current one.
    ///
    /// When the flag is set, the returned operations are the ones that were read successfully
    /// before the first damaged record.
    ///
    /// # Errors
    ///
    /// - [`ManifestLoadError::Io`] if the file cannot be read.
    /// - [`ManifestLoadError::CorruptedManifestHeader`] if the file is too short to hold a
    ///   header or the header does not start with `MANIFEST_MAGIC_NUMBER`.
    /// - [`ManifestLoadError::UnsupportedVersion`] if the header version is not
    ///   `MANIFEST_VERSION`.
    /// - [`ManifestLoadError::DecodeRecord`] if a record with a valid checksum cannot be
    ///   deserialized.
    pub fn load(
        mut file: VirtualFile,
    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
        let mut buf = vec![];
        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
        // Read manifest header
        let mut buf = Bytes::from(buf);
        if buf.remaining() < MANIFEST_HEADER_LEN {
            return Err(ManifestLoadError::CorruptedManifestHeader);
        }
        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
        buf.advance(MANIFEST_HEADER_LEN);
        // Reject files that are not manifests at all before interpreting the version field;
        // otherwise arbitrary bytes would be reported as a version mismatch or parsed as records.
        if header.magic_number != MANIFEST_MAGIC_NUMBER {
            return Err(ManifestLoadError::CorruptedManifestHeader);
        }
        if header.version != MANIFEST_VERSION {
            return Err(ManifestLoadError::UnsupportedVersion(
                header.version,
                MANIFEST_VERSION,
            ));
        }
        // Read operations
        let mut operations = Vec::new();
        let corrupted = loop {
            if buf.remaining() == 0 {
                // clean end of file
                break false;
            }
            if buf.remaining() < RECORD_HEADER_LEN {
                warn!("incomplete header when decoding manifest, could be corrupted");
                break true;
            }
            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
            let size = size as usize;
            buf.advance(RECORD_HEADER_LEN);
            if buf.remaining() < size {
                warn!("incomplete data when decoding manifest, could be corrupted");
                break true;
            }
            let data = &buf[..size];
            if crc32c(data) != checksum {
                warn!("checksum mismatch when decoding manifest, could be corrupted");
                break true;
            }
            // if the following decode fails, we cannot use the manifest or safely ignore any record.
            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
            buf.advance(size);
        };
        Ok((
            Self { file },
            operations,
            ManifestPartiallyCorrupted(corrupted),
        ))
    }
    /// Appends one record (header with size and crc32c checksum, then `data`) and fsyncs the file.
    ///
    /// # Panics
    ///
    /// Panics if `data.len()` is `u32::MAX` or larger, because the size is stored as a `u32`.
    fn append_data(&mut self, data: &[u8]) -> Result<()> {
        if data.len() >= u32::MAX as usize {
            panic!("data too large");
        }
        let header = RecordHeader {
            // cannot truncate: the length was checked against `u32::MAX` above
            size: data.len() as u32,
            checksum: crc32c(data),
        };
        let header = header.encode();
        self.file.write_all(&header)?;
        self.file.write_all(data)?;
        self.file.sync_all()?;
        Ok(())
    }
    /// Writes the encoded manifest header. Does not fsync by itself; the record appended
    /// afterwards does.
    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
        let encoded = header.encode();
        self.file.write_all(&encoded)?;
        Ok(())
    }
    /// Add an operation to the manifest. The operation will be appended to the end of the file,
    /// and the file will fsync.
    ///
    /// # Errors
    ///
    /// Returns an error if the operation cannot be serialized to JSON or the write/fsync fails.
    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
        // Serialize straight to bytes instead of going through an intermediate `String`.
        let encoded = serde_json::to_vec(&operation)?;
        self.append_data(&encoded)
    }
 }
 #[cfg(test)]
 mod tests {
    use std::fs::OpenOptions;
    use crate::repository::Key;
    use super::*;
    /// Writes a manifest, reopens it twice, and checks that every appended operation is read
    /// back unchanged and in order.
    #[test]
    fn test_read_manifest() {
        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
        std::fs::create_dir_all(&testdir).unwrap();
        let manifest_path = testdir.join("MANIFEST");
        // Opens the already existing manifest file for reading and writing without truncating it.
        let reopen = || {
            VirtualFile::open_with_options(
                &manifest_path,
                OpenOptions::new()
                    .read(true)
                    .write(true)
                    .create_new(false)
                    .truncate(false),
            )
            .unwrap()
        };
        let desc = |start: i128, end: i128| {
            PersistentLayerDesc::new_test(Key::from_i128(start)..Key::from_i128(end))
        };
        let layer1 = desc(0, 233);
        let layer2 = desc(233, 2333);
        let layer3 = desc(2333, 23333);
        let layer4 = desc(23333, 233333);
        // The three entries the file is expected to contain in the end.
        let snapshot = Snapshot {
            layers: vec![layer1, layer2],
        };
        let initial = Operation::Snapshot(snapshot.clone(), Lsn::from(0));
        let add_layer3 = Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1));
        let swap_layer3_for_layer4 = Operation::Operation(
            vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
            Lsn::from(2),
        );
        // Write a manifest with a snapshot and some operations
        let file = VirtualFile::create(&manifest_path).unwrap();
        let mut manifest = Manifest::init(file, snapshot, Lsn::from(0)).unwrap();
        manifest.append_operation(add_layer3.clone()).unwrap();
        drop(manifest);
        // Open the second time and write
        let (mut manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
        assert!(!corrupted.0);
        assert_eq!(operations.len(), 2);
        assert_eq!(&operations[0], &initial);
        assert_eq!(&operations[1], &add_layer3);
        manifest
            .append_operation(swap_layer3_for_layer4.clone())
            .unwrap();
        drop(manifest);
        // Open the third time and verify
        let (_manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
        assert!(!corrupted.0);
        assert_eq!(operations.len(), 3);
        assert_eq!(&operations[0], &initial);
        assert_eq!(&operations[1], &add_layer3);
        assert_eq!(&operations[2], &swap_layer3_for_layer4);
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -396,7 +396,9 @@ pub async fn delete_timeline(
    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant.delete_timeline(timeline_id, ctx).await?;
+    tenant
        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
        .await?;
    Ok(())
 }
@@ -673,7 +675,7 @@ pub async fn immediate_gc(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
@@ -722,11 +724,11 @@ pub async fn immediate_compact(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -225,7 +227,9 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
-    Deleted,
+    Deleted(IndexPart),
 }
 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Initialize the queue in stopped state. Used in startup path
    /// to continue deletion operation interrupted by pageserver crash or restart.
    ///
    /// On success the queue is stopped and its `deleted_at` progress is marked as
    /// `Successful` with the timestamp taken from `index_part`.
    ///
    /// # Errors
    ///
    /// Returns an error if `index_part.deleted_at` is not set, or if the upload queue cannot
    /// be initialized from `index_part`.
    pub fn init_upload_queue_stopped_to_continue_deletion(
        &self,
        index_part: &IndexPart,
    ) -> anyhow::Result<()> {
        // FIXME: consider newtype for DeletedIndexPart.
        // `ok_or_else` so that the error is only constructed when it is actually needed.
        let deleted_at = index_part.deleted_at.ok_or_else(|| {
            anyhow::anyhow!(
                "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
            )
        })?;
        {
            let mut upload_queue = self.upload_queue.lock().unwrap();
            upload_queue.initialize_with_current_remote_index_part(index_part)?;
            self.update_remote_physical_size_gauge(Some(index_part));
        }
        // also locks upload queue, without dropping the guard above it will be a deadlock
        self.stop().expect("initialized line above");
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue
            .stopped_mut()
            .expect("stopped above")
            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
        Ok(())
    }
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
            }
        }
    }
@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
        .await?;
        if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted)
+            Ok(MaybeDeletedIndexPart::Deleted(index_part))
        } else {
            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
        }
@@ -573,10 +608,7 @@ impl RemoteTimelineClient {
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
-        info!(
+        info!("scheduled layer file upload {layer_file_name}");
            "scheduled layer file upload {}",
            layer_file_name.file_name()
        );
        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
@@ -622,10 +654,14 @@ impl RemoteTimelineClient {
            // schedule the actual deletions
            for name in names {
-                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
+                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: false,
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {}", name.file_name());
+                info!("scheduled layer file deletion {name}");
            }
            // Launch the tasks immediately, if possible
@@ -639,18 +675,11 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled uploads/deletions to complete
    ///
    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let (sender, mut receiver) = tokio::sync::watch::channel(());
+        let mut receiver = {
        let barrier_op = UploadOp::Barrier(sender);
        {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            upload_queue.queued_operations.push_back(barrier_op);
+            self.schedule_barrier(upload_queue)
-            // Don't count this kind of operation!
+        };
            // Launch the task immediately, if possible
            self.launch_queued_tasks(upload_queue);
        }
        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +687,22 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Queues a barrier operation on `upload_queue` and returns the receiving half of the
    /// watch channel whose sender travels with the barrier.
    ///
    /// The barrier is deliberately not counted in the "unfinished calls" metric.
    fn schedule_barrier(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
    ) -> tokio::sync::watch::Receiver<()> {
        let (barrier_tx, barrier_rx) = tokio::sync::watch::channel(());
        upload_queue
            .queued_operations
            .push_back(UploadOp::Barrier(barrier_tx));
        // Start whatever can be started right away.
        self.launch_queued_tasks(upload_queue);
        barrier_rx
    }
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +710,7 @@ impl RemoteTimelineClient {
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
    #[instrument(skip_all)]
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +720,7 @@ impl RemoteTimelineClient {
            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
-            let stopped = match &mut *locked {
+            let stopped = locked.stopped_mut()?;
                UploadQueue::Uninitialized => {
                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
                }
                UploadQueue::Initialized(_) => {
                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
                }
                UploadQueue::Stopped(stopped) => stopped,
            };
            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,48 +734,34 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
-            let mut index_part = IndexPart::new(
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
-                stopped.latest_files.clone(),
+                .context("IndexPart serialize")?;
                stopped.last_uploaded_consistent_lsn,
                stopped
                    .latest_metadata
                    .to_bytes()
                    .context("serialize metadata")?,
            );
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
+            let stopped = locked
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                .stopped_mut()
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
                    locked.as_str(),
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });
        // Have a failpoint that can use the `pause` failpoint action.
        // We don't want to block the executor thread, hence, spawn_blocking + await.
-        #[cfg(feature = "testing")]
+        if cfg!(feature = "testing") {
-        tokio::task::spawn_blocking({
+            tokio::task::spawn_blocking({
-            let current = tracing::Span::current();
+                let current = tracing::Span::current();
-            move || {
+                move || {
-                let _entered = current.entered();
+                    let _entered = current.entered();
-                tracing::info!(
+                    tracing::info!("at failpoint persist_deleted_index_part");
-                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                    fail::fail_point!("persist_deleted_index_part");
-                );
+                }
-                fail::fail_point!(
+            })
-                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+            .await
-                );
+            .expect("spawn_blocking");
-            }
+        }
        })
        .await
        .expect("spawn_blocking");
        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
@@ -751,13 +775,10 @@ impl RemoteTimelineClient {
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
+
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+            let stopped = locked
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                .stopped_mut()
-                    locked.as_str(),
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
@@ -768,6 +789,90 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Deletes the timeline's files from remote storage.
    ///
    /// Prerequisites: the upload queue must be in the stopped state and `deleted_at` must have
    /// been set successfully.
    ///
    /// Layer files known to the queue are scheduled for deletion first. Once those operations
    /// are through, the timeline prefix is listed to catch anything that leaked; leftovers are
    /// deleted too, and the index file is deleted last.
    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_and_timeline_id();
        let (mut receiver, deletions_queued) = {
            let mut locked = self.upload_queue.lock().unwrap();
            let stopped = locked.stopped_mut()?;
            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
                anyhow::bail!("deleted_at is not set")
            }
            let queue = &mut stopped.upload_queue_for_deletion;
            debug_assert!(queue.no_pending_work());
            queue.queued_operations.reserve(queue.latest_files.len());
            // schedule one deletion per known layer file
            let mut deletions_queued = 0;
            for name in queue.latest_files.keys() {
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: true,
                });
                self.calls_unfinished_metric_begin(&op);
                queue.queued_operations.push_back(op);
                info!("scheduled layer file deletion {name}");
                deletions_queued += 1;
            }
            self.launch_queued_tasks(queue);
            let barrier = self.schedule_barrier(queue);
            (barrier, deletions_queued)
        };
        // The lock guard is gone by now; wait for the barrier scheduled behind the deletions.
        receiver.changed().await?;
        // The index part has to outlive everything else: it is needed for a possible retry.
        // If it were removed first and the retry arrived at a different pageserver, there
        // would be no trace of the timeline left on remote storage.
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
        let remaining: Vec<RemotePath> = self
            .storage_impl
            .list_prefixes(Some(&timeline_storage_path))
            .await?
            .into_iter()
            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
            .collect();
        if !remaining.is_empty() {
            warn!(
                "Found {} files not bound to index_file.json, proceeding with their deletion",
                remaining.len()
            );
            warn!("About to remove {} files", remaining.len());
            self.storage_impl.delete_objects(&remaining).await?;
        }
        // Finally remove the index file itself.
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;
        info!(deletions_queued, "done deleting, including index_part.json");
        Ok(())
    }
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -786,7 +891,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -817,7 +922,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -891,7 +996,6 @@ impl RemoteTimelineClient {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
                self.calls_unfinished_metric_end(&task.op);
                return;
            }
@@ -937,16 +1041,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
+                UploadOp::Delete(delete) => {
                    let path = &self
                        .conf
                        .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(layer_file_name.file_name());
+                        .join(delete.layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
-                            *metric_file_kind,
+                            delete.file_kind,
                            RemoteOpKind::Delete,
                            Arc::clone(&self.metrics),
                        )
@@ -1012,11 +1116,24 @@ impl RemoteTimelineClient {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_) => {
+                UploadQueue::Stopped(stopped) => {
                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
                    // then stop() took care of it so we just return.
                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
                    match &task.op {
                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
                        _ => None
                    }
                },
                UploadQueue::Initialized(qi) => { Some(qi) }
            };
            let upload_queue = match upload_queue {
                Some(upload_queue) => upload_queue,
                None => {
                    info!("another concurrent task already stopped the queue");
                    return;
-                }, // nothing to do
+                }
                UploadQueue::Initialized(qi) => { qi }
            };
            upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1146,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
                }
                UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1180,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(file_kind, _) => (
+            UploadOp::Delete(delete) => (
-                *file_kind,
+                delete.file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1228,36 @@ impl RemoteTimelineClient {
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
-            UploadQueue::Initialized(UploadQueueInitialized {
+            UploadQueue::Initialized(initialized) => {
                latest_files,
                latest_metadata,
                last_uploaded_consistent_lsn,
                ..
            }) => {
                info!("shutting down upload queue");
                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    // take or clone what we need
+                    // Here we preserve working version of the upload queue for possible use during deletions.
-                    let latest_files = std::mem::take(latest_files);
+                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
-                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // but for this use case it doesn't really make sense to bring in unsafe code only for this usage point.
-                    // this could be Copy
+                    // Deletion is not really perf sensitive so there shouldn't be any problems with cloning a fraction of it.
-                    let latest_metadata = latest_metadata.clone();
+                    let upload_queue_for_deletion = UploadQueueInitialized {
-
+                        task_counter: 0,
-                    let stopped = UploadQueueStopped {
+                        latest_files: initialized.latest_files.clone(),
-                        latest_files,
+                        latest_files_changes_since_metadata_upload_scheduled: 0,
-                        last_uploaded_consistent_lsn,
+                        latest_metadata: initialized.latest_metadata.clone(),
-                        latest_metadata,
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                        num_inprogress_layer_uploads: 0,
                        num_inprogress_metadata_uploads: 0,
                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
                    };
-                    let upload_queue =
+                    let upload_queue = std::mem::replace(
-                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
+                        &mut *guard,
                        UploadQueue::Stopped(UploadQueueStopped {
                            upload_queue_for_deletion,
                            deleted_at: SetDeletedFlagProgress::NotRunning,
                        }),
                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1144,8 +1265,6 @@ impl RemoteTimelineClient {
                    }
                };
                assert!(qi.latest_files.is_empty(), "do not use this anymore");
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1243,7 +1362,7 @@ mod tests {
    struct TestSetup {
        runtime: &'static tokio::runtime::Runtime,
        entered_runtime: EnterGuard<'static>,
-        harness: TenantHarness<'static>,
+        harness: TenantHarness,
        tenant: Arc<Tenant>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
@@ -1264,7 +1383,12 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = runtime.block_on(tenant.create_test_timeline(
                TIMELINE_ID,
                Lsn(8),
                DEFAULT_PG_VERSION,
                &ctx,
            ))?;
            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1408,7 +1532,7 @@ mod tests {
        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
        assert_file_list(
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::bin_ser::SerializeError;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use utils::lsn::Lsn;
@@ -115,6 +117,21 @@ impl IndexPart {
    }
 }
 impl TryFrom<&UploadQueueInitialized> for IndexPart {
    type Error = SerializeError;
    /// Builds an [`IndexPart`] from the state held by an initialized upload queue.
    ///
    /// Takes the disk-consistent LSN and the serialized bytes of the queue's
    /// `latest_metadata`, plus a clone of `latest_files`, and hands them to `Self::new`.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `to_bytes()` on the metadata.
    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let metadata = &upload_queue.latest_metadata;
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let serialized_metadata = metadata.to_bytes()?;
        // The file map is cloned only after the metadata serialized successfully.
        let layer_files = upload_queue.latest_files.clone();
        Ok(Self::new(
            layer_files,
            disk_consistent_lsn,
            serialized_metadata,
        ))
    }
 }
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -176,13 +176,10 @@ impl LayerAccessStats {
    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn for_loading_layer<L>(
+    pub(crate) fn for_loading_layer(
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        status: LayerResidenceStatus,
-    ) -> Self
+    ) -> Self {
    where
        L: ?Sized + Layer,
    {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
        new.record_residence_event(
            layer_map_lock_held_witness,
@@ -197,14 +194,11 @@ impl LayerAccessStats {
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn clone_for_residence_change<L>(
+    pub(crate) fn clone_for_residence_change(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats
+    ) -> LayerAccessStats {
    where
        L: ?Sized + Layer,
    {
        let clone = {
            let inner = self.0.lock().unwrap();
            inner.clone()
@@ -232,14 +226,12 @@ impl LayerAccessStats {
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
    ///
-    pub(crate) fn record_residence_event<L>(
+    pub(crate) fn record_residence_event(
        &self,
-        _layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        _layer_map_lock_held_witness: &BatchedUpdates<'_>,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
-    ) where
+    ) {
        L: ?Sized + Layer,
    {
        let mut locked = self.0.lock().unwrap();
        locked.iter_mut().for_each(|inner| {
            inner
@@ -343,7 +335,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-pub trait Layer: std::fmt::Debug + Send + Sync {
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;
@@ -381,18 +373,15 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
        ctx: &RequestContext,
    ) -> Result<ValueReconstructResult>;
    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
    fn short_id(&self) -> String;
    /// Dump summary of the contents of the layer to stdout
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }
 /// Returned by [`Layer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
 /// Returned by [`Layer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
@@ -473,94 +462,127 @@ pub fn downcast_remote_layer(
    }
 }
-/// Holds metadata about a layer without any content. Used mostly for testing.
+pub mod tests {
-///
+    use super::*;
 /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
 /// LayerDescriptor.
 #[derive(Clone, Debug)]
 pub struct LayerDescriptor {
    pub key: Range<Key>,
    pub lsn: Range<Lsn>,
    pub is_incremental: bool,
    pub short_id: String,
 }
-impl LayerDescriptor {
+    /// Holds metadata about a layer without any content. Used mostly for testing.
-    /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
+    ///
-    /// and the tenant / timeline id does not matter.
+    /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
-    pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
+    /// LayerDescriptor.
-        PersistentLayerDesc::new_delta(
+    #[derive(Clone, Debug)]
-            TenantId::from_array([0; 16]),
+    pub struct LayerDescriptor {
-            TimelineId::from_array([0; 16]),
+        base: PersistentLayerDesc,
            self.key.clone(),
            self.lsn.clone(),
            233,
        )
    }
 }
 impl Layer for LayerDescriptor {
    fn get_key_range(&self) -> Range<Key> {
        self.key.clone()
    }
-    fn get_lsn_range(&self) -> Range<Lsn> {
+    impl From<PersistentLayerDesc> for LayerDescriptor {
-        self.lsn.clone()
+        fn from(base: PersistentLayerDesc) -> Self {
-    }
+            Self { base }
    fn is_incremental(&self) -> bool {
        self.is_incremental
    }
    fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
        _reconstruct_data: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
        todo!("This method shouldn't be part of the Layer trait")
    }
    fn short_id(&self) -> String {
        self.short_id.clone()
    }
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        todo!()
    }
 }
 impl From<DeltaFileName> for LayerDescriptor {
    fn from(value: DeltaFileName) -> Self {
        let short_id = value.to_string();
        LayerDescriptor {
            key: value.key_range,
            lsn: value.lsn_range,
            is_incremental: true,
            short_id,
        }
    }
 }
-impl From<ImageFileName> for LayerDescriptor {
+    impl Layer for LayerDescriptor {
-    fn from(value: ImageFileName) -> Self {
+        fn get_value_reconstruct_data(
-        let short_id = value.to_string();
+            &self,
-        let lsn = value.lsn_as_range();
+            _key: Key,
-        LayerDescriptor {
+            _lsn_range: Range<Lsn>,
-            key: value.key_range,
+            _reconstruct_data: &mut ValueReconstructState,
-            lsn,
+            _ctx: &RequestContext,
-            is_incremental: false,
+        ) -> Result<ValueReconstructResult> {
-            short_id,
+            todo!("This method shouldn't be part of the Layer trait")
        }
        fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
            todo!()
        }
        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
        fn get_key_range(&self) -> Range<Key> {
            self.layer_desc().key_range.clone()
        }
        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
        fn get_lsn_range(&self) -> Range<Lsn> {
            self.layer_desc().lsn_range.clone()
        }
        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
        fn is_incremental(&self) -> bool {
            self.layer_desc().is_incremental
        }
    }
 }
-impl From<LayerFileName> for LayerDescriptor {
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn from(value: LayerFileName) -> Self {
+    impl std::fmt::Display for LayerDescriptor {
-        match value {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            LayerFileName::Delta(d) => Self::from(d),
+            write!(f, "{}", self.layer_desc().short_id())
-            LayerFileName::Image(i) => Self::from(i),
+        }
    }
    /// Stub [`PersistentLayer`] implementation for [`LayerDescriptor`].
    ///
    /// Only `layer_desc` is functional; every other method panics via
    /// `unimplemented!()` when called.
    impl PersistentLayer for LayerDescriptor {
        /// Returns the wrapped descriptor stored in `base`.
        fn layer_desc(&self) -> &PersistentLayerDesc {
            &self.base
        }
        /// Not supported by this stub; panics.
        fn local_path(&self) -> Option<PathBuf> {
            unimplemented!()
        }
        /// Not supported by this stub; panics.
        fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
            unimplemented!()
        }
        /// Not supported by this stub; panics.
        fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
            unimplemented!()
        }
        /// Not supported by this stub; panics.
        fn delete_resident_layer_file(&self) -> Result<()> {
            unimplemented!()
        }
        /// Not supported by this stub; panics.
        fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
            unimplemented!()
        }
        /// Not supported by this stub; panics.
        fn access_stats(&self) -> &LayerAccessStats {
            unimplemented!()
        }
    }
    impl From<DeltaFileName> for LayerDescriptor {
        /// Wraps a parsed delta file name in a [`LayerDescriptor`].
        ///
        /// The key and LSN ranges come from `value`; tenant and timeline ids are
        /// all-zero, and the trailing `233` is a fixed placeholder
        /// (NOTE(review): presumably the file size — confirm against `new_delta`).
        fn from(value: DeltaFileName) -> Self {
            let tenant_id = TenantId::from_array([0; 16]);
            let timeline_id = TimelineId::from_array([0; 16]);
            let base = PersistentLayerDesc::new_delta(
                tenant_id,
                timeline_id,
                value.key_range,
                value.lsn_range,
                233,
            );
            Self { base }
        }
    }
    impl From<ImageFileName> for LayerDescriptor {
        /// Wraps a parsed image file name in a [`LayerDescriptor`].
        ///
        /// The key range and LSN come from `value`; tenant and timeline ids are
        /// all-zero, and the trailing `false` / `233` are fixed placeholders
        /// (NOTE(review): their meaning is defined by `new_img` — confirm there).
        fn from(value: ImageFileName) -> Self {
            let tenant_id = TenantId::from_array([0; 16]);
            let timeline_id = TimelineId::from_array([0; 16]);
            let base = PersistentLayerDesc::new_img(
                tenant_id,
                timeline_id,
                value.key_range,
                value.lsn,
                false,
                233,
            );
            Self { base }
        }
    }
    impl From<LayerFileName> for LayerDescriptor {
        /// Converts either kind of layer file name by delegating to the
        /// variant-specific `From` implementation.
        fn from(value: LayerFileName) -> Self {
            match value {
                LayerFileName::Delta(delta) => delta.into(),
                LayerFileName::Image(image) => image.into(),
            }
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;
 use utils::{
@@ -184,7 +184,7 @@ pub struct DeltaLayer {
    access_stats: LayerAccessStats,
-    inner: RwLock<DeltaLayerInner>,
+    inner: OnceCell<DeltaLayerInner>,
 }
 impl std::fmt::Debug for DeltaLayer {
@@ -201,21 +201,17 @@ impl std::fmt::Debug for DeltaLayer {
 }
 pub struct DeltaLayerInner {
    /// If false, the fields below have not been loaded into memory yet.
    loaded: bool,
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
-    /// Reader object for reading blocks from the file. (None if not loaded yet)
+    /// Reader object for reading blocks from the file.
-    file: Option<FileBlockReader<VirtualFile>>,
+    file: FileBlockReader<VirtualFile>,
 }
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -246,7 +242,7 @@ impl Layer for DeltaLayer {
            inner.index_start_blk, inner.index_root_blk
        );
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -315,7 +311,7 @@ impl Layer for DeltaLayer {
            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
            // Scan the page versions backwards, starting from `lsn`.
-            let file = inner.file.as_ref().unwrap();
+            let file = &inner.file;
            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
@@ -398,10 +394,11 @@ impl Layer for DeltaLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-
+}
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
+impl std::fmt::Display for DeltaLayer {
-        self.layer_desc().short_id()
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
 }
@@ -500,51 +497,22 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
        &self,
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<RwLockReadGuard<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        loop {
+        // Quick exit if already loaded
-            // Quick exit if already loaded
+        self.inner
-            let inner = self.inner.read().unwrap();
+            .get_or_try_init(|| self.load_inner())
-            if inner.loaded {
+            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
                return Ok(inner);
            }
            // Need to open the file and load the metadata. Upgrade our lock to
            // a write lock. (Or rather, release and re-lock in write mode.)
            drop(inner);
            let inner = self.inner.write().unwrap();
            if !inner.loaded {
                self.load_inner(inner).with_context(|| {
                    format!("Failed to load delta layer {}", self.path().display())
                })?;
            } else {
                // Another thread loaded it while we were not holding the lock.
            }
            // We now have the file open and loaded. There's no function to do
            // that in the std library RwLock, so we have to release and re-lock
            // in read mode. (To be precise, the lock guard was moved in the
            // above call to `load_inner`, so it's already been released). And
            // while we do that, another thread could unload again, so we have
            // to re-check and retry if that happens.
        }
    }
-    fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();
-        // Open the file if it's not open already.
+        let file = VirtualFile::open(&path)
-        if inner.file.is_none() {
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            let file = VirtualFile::open(&path)
+        let file = FileBlockReader::new(file);
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+
            inner.file = Some(FileBlockReader::new(file));
        }
        let file = inner.file.as_mut().unwrap();
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -571,13 +539,13 @@ impl DeltaLayer {
            }
        }
        inner.index_start_blk = actual_summary.index_start_blk;
        inner.index_root_blk = actual_summary.index_root_blk;
        debug!("loaded from {}", &path.display());
-        inner.loaded = true;
+        Ok(DeltaLayerInner {
-        Ok(())
+            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
        })
    }
    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -599,12 +567,7 @@ impl DeltaLayer {
                file_size,
            ),
            access_stats,
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk: 0,
                index_root_blk: 0,
            }),
        }
    }
@@ -631,12 +594,7 @@ impl DeltaLayer {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk: 0,
                index_root_blk: 0,
            }),
        })
    }
@@ -800,12 +758,7 @@ impl DeltaLayerWriterInner {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk,
                index_root_blk,
            }),
        };
        // fsync the file
@@ -940,13 +893,13 @@ struct DeltaValueIter<'a> {
    reader: BlockCursor<Adapter<'a>>,
 }
-struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
+struct Adapter<'a>(&'a DeltaLayerInner);
 impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;
    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.as_ref().unwrap().read_blk(blknum)
+        self.0.file.read_blk(blknum)
    }
 }
@@ -959,8 +912,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
 }
 impl<'a> DeltaValueIter<'a> {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1033,8 +986,8 @@ impl Iterator for DeltaKeyIter {
 }
 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1074,3 +1027,21 @@ impl<'a> DeltaKeyIter {
        Ok(iter)
    }
 }
 #[cfg(test)]
 mod test {
    use super::{DeltaKeyIter, DeltaLayer, DeltaValueIter};
    // The compaction code will soon need these iterators to be `Send`.
    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
    // Cf https://github.com/neondatabase/neon/issues/4471
    /// Compile-time check: this test only builds if all three types are `Send`.
    #[test]
    fn is_send() {
        fn require_send<T: Send>() {}
        require_send::<DeltaLayer>();
        require_send::<DeltaValueIter>();
        require_send::<DeltaKeyIter>();
    }
 }
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -210,9 +210,15 @@ pub enum LayerFileName {
 impl LayerFileName {
    /// Returns the file name as an owned `String`, produced through this
    /// type's `Display` implementation.
    pub fn file_name(&self) -> String {
        ToString::to_string(self)
    }
 }
 impl fmt::Display for LayerFileName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
-            Self::Image(fname) => fname.to_string(),
+            Self::Image(fname) => write!(f, "{fname}"),
-            Self::Delta(fname) => fname.to_string(),
+            Self::Delta(fname) => write!(f, "{fname}"),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -230,10 +230,12 @@ impl Layer for ImageLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
 }
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
+impl std::fmt::Display for ImageLayer {
-        self.layer_desc().short_id()
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
 }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -131,13 +131,6 @@ impl Layer for InMemoryLayer {
        true
    }
    fn short_id(&self) -> String {
        let inner = self.inner.read().unwrap();
        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
        format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();
@@ -240,6 +233,15 @@ impl Layer for InMemoryLayer {
    }
 }
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let inner = self.inner.read().unwrap();
        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }
 impl InMemoryLayer {
    ///
    /// Get layer size on the disk
@@ -304,7 +306,7 @@ impl InMemoryLayer {
        Ok(())
    }
-    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,5 @@
 use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
    id::{TenantId, TimelineId},
@@ -9,10 +10,12 @@ use crate::{context::RequestContext, repository::Key};
 use super::{DeltaFileName, ImageFileName, LayerFileName};
 use serde::{Deserialize, Serialize};
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
@@ -46,8 +49,21 @@ impl PersistentLayerDesc {
        }
    }
-    pub fn short_id(&self) -> String {
+    pub fn short_id(&self) -> impl Display {
-        self.filename().file_name()
+        self.filename()
    }
    /// Test-only constructor: builds a non-delta, non-incremental, zero-sized
    /// descriptor for `key_range` with freshly generated tenant and timeline ids
    /// and the LSN range `0..1`.
    #[cfg(test)]
    pub fn new_test(key_range: Range<Key>) -> Self {
        let tenant_id = TenantId::generate();
        let timeline_id = TimelineId::generate();
        let lsn_range = Lsn(0)..Lsn(1);
        Self {
            tenant_id,
            timeline_id,
            key_range,
            lsn_range,
            is_delta: false,
            is_incremental: false,
            file_size: 0,
        }
    }
    pub fn new_img(
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -71,10 +71,7 @@ impl Layer for RemoteLayer {
        _reconstruct_state: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
-        bail!(
+        bail!("layer {self} needs to be downloaded");
            "layer {} needs to be downloaded",
            self.filename().file_name()
        );
    }
    /// debugging function to print out the contents of the layer
@@ -106,10 +103,12 @@ impl Layer for RemoteLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
 }
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
+impl std::fmt::Display for RemoteLayer {
-        self.layer_desc().short_id()
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
 }
@@ -218,15 +217,12 @@ impl RemoteLayer {
    }
    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer<L>(
+    pub fn create_downloaded_layer(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        conf: &'static PageServerConf,
        file_size: u64,
-    ) -> Arc<dyn PersistentLayer>
+    ) -> Arc<dyn PersistentLayer> {
    where
        L: ?Sized + Layer,
    {
        if self.desc.is_delta {
            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -70,7 +70,6 @@ impl Timeline {
                };
                self_clone.eviction_task(cancel).await;
                info!("eviction task finishing");
                Ok(())
            },
        );
@@ -78,6 +77,9 @@ impl Timeline {
    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
        scopeguard::defer! {
            info!("eviction task finishing");
        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
@@ -86,7 +88,6 @@ impl Timeline {
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
            if random_init_delay(period, &cancel).await.is_err() {
                info!("shutting down");
                return;
            }
        }
@@ -101,7 +102,6 @@ impl Timeline {
                ControlFlow::Continue(sleep_until) => {
                    tokio::select! {
                        _ = cancel.cancelled() => {
                            info!("shutting down");
                            break;
                        }
                        _ = tokio::time::sleep_until(sleep_until) => { }
@@ -197,9 +197,11 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let layers = self.layers.read().unwrap();
+            let guard = self.layers.read().await;
            let (layers, mapping) = &*guard;
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = mapping.get_from_desc(&hist_layer);
                if hist_layer.is_remote_layer() {
                    continue;
                }
@@ -207,7 +209,7 @@ impl Timeline {
                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
+                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
                    SystemTime::now()
                });
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -153,7 +153,7 @@ pub(super) async fn connection_manager_loop_step(
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping => {
+                                TimelineState::Broken { .. } | TimelineState::Stopping => {
                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
@@ -1321,10 +1321,11 @@ mod tests {
    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
-    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
+    async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
            .await
            .expect("Failed to create an empty timeline for dummy wal connection manager");
        ConnectionManagerState {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection(
    ctx: RequestContext,
    node: NodeId,
 ) -> anyhow::Result<()> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    WALRECEIVER_STARTED_CONNECTIONS.inc();
    // Connect to the database in replication mode.
@@ -140,6 +142,9 @@ pub(super) async fn handle_walreceiver_connection(
            }
            Ok(())
        }
        // Enrich the log lines emitted by this closure with meaningful context.
        // TODO: technically, this task outlives the surrounding function, so, the
        // spans won't be properly nested.
        .instrument(tracing::info_span!("poller")),
    );
@@ -304,12 +309,15 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }
-        timeline.check_checkpoint_distance().with_context(|| {
+        timeline
-            format!(
+            .check_checkpoint_distance()
-                "Failed to check checkpoint distance for timeline {}",
+            .await
-                timeline.timeline_id
+            .with_context(|| {
-            )
+                format!(
-        })?;
+                    "Failed to check checkpoint distance for timeline {}",
                    timeline.timeline_id
                )
            })?;
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn =
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }
 impl UploadQueueInitialized {
    /// Returns `true` when there are neither in-progress tasks nor queued operations.
    pub(super) fn no_pending_work(&self) -> bool {
        let has_inprogress = !self.inprogress_tasks.is_empty();
        let has_queued = !self.queued_operations.is_empty();
        !(has_inprogress || has_queued)
    }
 }
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
    NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
 }
 pub(super) struct UploadQueueStopped {
-    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) last_uploaded_consistent_lsn: Lsn,
    pub(super) latest_metadata: TimelineMetadata,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }
@@ -187,6 +191,15 @@ impl UploadQueue {
            UploadQueue::Initialized(x) => Ok(x),
        }
    }
    /// Mutable access to the stopped-state payload.
    ///
    /// # Errors
    /// Fails with "queue is in state ..." when the queue is not in the `Stopped` state.
    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
        match self {
            UploadQueue::Stopped(stopped) => Ok(stopped),
            UploadQueue::Uninitialized | UploadQueue::Initialized(_) => {
                Err(anyhow::anyhow!("queue is in state {}", self.as_str()))
            }
        }
    }
 }
 /// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }
 /// Payload of an `UploadOp::Delete` operation: identifies one layer file to delete.
 #[derive(Debug)]
 pub(crate) struct Delete {
    /// Kind of remote file the operation targets.
    pub(crate) file_kind: RemoteOpFileKind,
    /// Name of the layer file to delete.
    pub(crate) layer_file_name: LayerFileName,
    /// Printed in the `UploadOp` display output.
    /// NOTE(review): presumably set when the deletion was scheduled as part of
    /// deleting the whole timeline — confirm against the scheduling code.
    pub(crate) scheduled_from_timeline_delete: bool,
 }
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
-    /// Delete a file.
+    /// Delete a layer file
-    Delete(RemoteOpFileKind, LayerFileName),
+    Delete(Delete),
    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
                )
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Delete(delete) => write!(
                f,
                "Delete(path: {}, scheduled_from_timeline_delete: {})",
                delete.layer_file_name.file_name(),
                delete.scheduled_from_timeline_delete
            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -302,15 +302,6 @@ impl VirtualFile {
            .observe_closure_duration(|| self.open_options.open(&self.path))?;
        // Perform the requested operation on it
        //
        // TODO: We could downgrade the locks to read mode before calling
        // 'func', to allow a little bit more concurrency, but the standard
        // library RwLock doesn't allow downgrading without releasing the lock,
        // and that doesn't seem worth the trouble.
        //
        // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and
        // may deadlock on subsequent read calls.
        // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly.
        let result = STORAGE_IO_TIME
            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
            .observe_closure_duration(|| func(&file));
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
-use anyhow::Result;
+use anyhow::{Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit()?;
+        modification.commit().await?;
        Ok(())
    }
@@ -1082,7 +1082,10 @@ impl<'a> WalIngest<'a> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            modification.put_rel_creation(rel, 0, ctx).await?;
+            modification
                .put_rel_creation(rel, 0, ctx)
                .await
                .context("Relation Error")?;
            0
        } else {
            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1171,7 +1174,6 @@ impl<'a> WalIngest<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::pgdatadir_mapping::create_test_timeline;
    use crate::tenant::harness::*;
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1200,7 +1202,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit()?;
+        m.commit().await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
        Ok(walingest)
@@ -1209,7 +1211,9 @@ mod tests {
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x20));
@@ -1217,22 +1221,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x50));
@@ -1318,7 +1322,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x60));
        // Check reported size and contents after truncation
@@ -1360,7 +1364,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1373,7 +1377,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1398,7 +1402,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1428,14 +1432,16 @@ mod tests {
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel exists and size is correct
        assert_eq!(
@@ -1454,7 +1460,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel is not visible anymore
        assert_eq!(
@@ -1472,7 +1478,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel exists and size is correct
        assert_eq!(
@@ -1497,7 +1503,9 @@ mod tests {
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        // Create a 20 MB relation (the size is arbitrary)
@@ -1509,7 +1517,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1554,7 +1562,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check reported size and contents after truncation
        assert_eq!(
@@ -1603,7 +1611,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
@@ -1637,7 +1645,9 @@ mod tests {
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut lsn = 0x10;
@@ -1648,7 +1658,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
        }
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1664,7 +1674,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1677,7 +1687,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1693,7 +1703,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -122,6 +122,43 @@ hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
 						   true, true, hnsw_build_callback, (void *) hnsw, NULL);
 }
 #ifdef __APPLE__
 #include <sys/types.h>
 #include <sys/sysctl.h>
 /*
  * Raise an ERROR when `requested` bytes for the HNSW index do not fit into
  * physical RAM once the shared buffers (NBuffers * BLCKSZ) are accounted for.
  * macOS variant: physical memory size is read from the "hw.memsize" sysctl.
  */
 static void
 hnsw_check_available_memory(Size requested)
 {
 	size_t		total = 0;
 	size_t		len = sizeof(total);
 	Size		reserved = (Size) NBuffers * BLCKSZ;
 	Size		available;
 	/* The value is returned through oldp; oldlenp only carries the buffer size */
 	if (sysctlbyname("hw.memsize", &total, &len, NULL, 0) < 0)
 		elog(ERROR, "Failed to get amount of RAM: %m");
 	/* Clamp to zero rather than letting the unsigned subtraction wrap around */
 	available = (total > reserved) ? total - reserved : 0;
 	if (requested >= available)
 		elog(ERROR, "HNSW index requires %zu bytes while only %zu are available",
 			requested, available);
 }
 #else
 #include <sys/sysinfo.h>
 /*
  * Raise an ERROR when `requested` bytes for the HNSW index do not fit into
  * physical RAM once the shared buffers (NBuffers * BLCKSZ) are accounted for.
  * Variant based on sysinfo(2).
  */
 static void
 hnsw_check_available_memory(Size requested)
 {
 	struct sysinfo si;
 	Size		total;
 	Size		reserved = (Size) NBuffers * BLCKSZ;
 	Size		available;
 	if (sysinfo(&si) < 0)
 		elog(ERROR, "Failed to get amount of RAM: %m");
 	/* totalram is expressed in units of mem_unit bytes */
 	total = (Size) si.totalram * si.mem_unit;
 	/* Clamp to zero rather than letting the unsigned subtraction wrap around */
 	available = (total > reserved) ? total - reserved : 0;
 	if (requested >= available)
 		elog(ERROR, "HNSW index requires %zu bytes while only %zu are available",
 			requested, available);
 }
 #endif
 static HierarchicalNSW*
 hnsw_get_index(Relation indexRel, Relation heapRel)
 {
@@ -156,6 +193,8 @@ hnsw_get_index(Relation indexRel, Relation heapRel)
 		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
 		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
 		hnsw_check_available_memory(shmem_size);
 		/* first try to attach to an existing index */
 		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
 						 &mapped_address, &mapped_size, DEBUG1))
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -1,4 +1,4 @@
-comment = 'hNsw index'
+comment = 'hnsw index'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -32,6 +32,7 @@
 #include "port.h"
 #include <curl/curl.h>
 #include "utils/jsonb.h"
 #include "libpq/crypt.h"
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
@@ -161,7 +162,22 @@ ConstructDeltaMessage()
 			PushKeyValue(&state, "name", entry->name);
 			if (entry->password)
 			{
 #if PG_MAJORVERSION_NUM == 14
 				char	   *logdetail;
 #else
 				const char *logdetail;
 #endif
 				PushKeyValue(&state, "password", (char *) entry->password);
 				char	   *encrypted_password = get_role_password(entry->name, &logdetail);
 				if (encrypted_password)
 				{
 					PushKeyValue(&state, "encrypted_password", encrypted_password);
 				}
 				else
 				{
 					elog(ERROR, "Failed to get encrypted password: %s", logdetail);
 				}
 			}
 			if (entry->old_name[0] != '\0')
 			{
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -190,7 +190,7 @@ lfc_change_limit_hook(int newval, void *extra)
 		hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
 		lfc_ctl->used -= 1;
 	}
-	elog(LOG, "set local file cache limit to %d", new_size);
+	elog(DEBUG1, "set local file cache limit to %d", new_size);
 	LWLockRelease(lfc_lock);
 }
--- a/pgxn/neon/neon--1.0.sql
+++ b/pgxn/neon/neon--1.0.sql
@@ -32,3 +32,7 @@ CREATE VIEW local_cache AS
 	SELECT P.* FROM local_cache_pages() AS P
 	(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
 	 relforknumber int2, relblocknumber int8, accesscount int4);
 CREATE FUNCTION copy_from(conninfo cstring) RETURNS BIGINT
 AS 'MODULE_PATHNAME', 'copy_from'
 LANGUAGE C;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -13,20 +13,32 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/relation.h"
 #include "access/xloginsert.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "catalog/pg_type.h"
 #include "catalog/namespace.h"
 #include "replication/walsender.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "access/htup_details.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
 #include "utils/wait_event.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
 #include "utils/builtins.h"
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
 #include "control_plane_connector.h"
 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
 #include "libpq/libpq.h"
 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -46,6 +58,7 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
 PG_FUNCTION_INFO_V1(copy_from);
 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -91,3 +104,281 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
 #define N_RAW_PAGE_COLUMNS 4
 #define COPY_FETCH_COUNT   16
 /*
  * Report an error received from the remote server.
  *
  * elevel: error level passed to ereport()
  * res: PGresult describing the failure; callers in this file also pass NULL
  * conn: connection the command was run on (message fallback)
  * clear: if true, PQclear(res) is invoked before returning or propagating
  * sql: text of the remote command, added as error context when not NULL
  */
 static void
 report_error(int elevel, PGresult *res, PGconn *conn,
 			 bool clear, const char *sql)
 {
 	/* If requested, PGresult must be released before leaving this function. */
 	PG_TRY();
 	{
 		char	   *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
 		char	   *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY);
 		char	   *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL);
 		char	   *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT);
 		char	   *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT);
 		int			sqlstate;
 		/* Rebuild the five-character SQLSTATE; without one, report a connection failure */
 		if (diag_sqlstate)
 			sqlstate = MAKE_SQLSTATE(diag_sqlstate[0],
 									 diag_sqlstate[1],
 									 diag_sqlstate[2],
 									 diag_sqlstate[3],
 									 diag_sqlstate[4]);
 		else
 			sqlstate = ERRCODE_CONNECTION_FAILURE;
 		/*
 		 * If we don't get a message from the PGresult, try the PGconn.  This
 		 * is needed because for connection-level failures, PQexec may just
 		 * return NULL, not a PGresult at all.
 		 */
 		if (message_primary == NULL)
 			message_primary = pchomp(PQerrorMessage(conn));
 		/* Optional fields are attached only when the remote side supplied them */
 		ereport(elevel,
 				(errcode(sqlstate),
 				 (message_primary != NULL && message_primary[0] != '\0') ?
 				 errmsg_internal("%s", message_primary) :
 				 errmsg("could not obtain message string for remote error"),
 				 message_detail ? errdetail_internal("%s", message_detail) : 0,
 				 message_hint ? errhint("%s", message_hint) : 0,
 				 message_context ? errcontext("%s", message_context) : 0,
 				 sql ? errcontext("remote SQL command: %s", sql) : 0));
 	}
 	PG_FINALLY();
 	{
 		/* Runs on both the normal and the error path */
 		if (clear)
 			PQclear(res);
 	}
 	PG_END_TRY();
 }
 /*
  * Collect the results of a command already in progress on conn and return
  * the last PGresult (NULL if there was none); earlier results are cleared.
  * While the connection is busy, waits on the process latch and the socket
  * and checks for interrupts.
  *
  * query is used only for error reporting. The caller owns the returned
  * result.
  */
 static PGresult *
 get_result(PGconn *conn, const char *query)
 {
 	/* volatile: modified inside PG_TRY and read in PG_CATCH */
 	PGresult   *volatile last_res = NULL;
 	/* In what follows, do not leak any PGresults on an error. */
 	PG_TRY();
 	{
 		for (;;)
 		{
 			PGresult   *res;
 			while (PQisBusy(conn))
 			{
 				int			wc;
 				/* Sleep until there's something to do */
 				wc = WaitLatchOrSocket(MyLatch,
 									   WL_LATCH_SET | WL_SOCKET_READABLE |
 									   WL_EXIT_ON_PM_DEATH,
 									   PQsocket(conn),
 									   -1L, PG_WAIT_EXTENSION);
 				ResetLatch(MyLatch);
 				CHECK_FOR_INTERRUPTS();
 				/* Data available in socket? */
 				if (wc & WL_SOCKET_READABLE)
 				{
 					if (!PQconsumeInput(conn))
 						report_error(ERROR, NULL, conn, false, query);
 				}
 			}
 			res = PQgetResult(conn);
 			if (res == NULL)
 				break;			/* query is complete */
 			/* Keep only the most recent result */
 			PQclear(last_res);
 			last_res = res;
 		}
 	}
 	PG_CATCH();
 	{
 		/* Release the pending result before propagating the error */
 		PQclear(last_res);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
 	return last_res;
 }
 /*
  * SQL text that (re)creates the PL/pgSQL function copydata().
  *
  * copydata() walks every pg_class entry outside pg_catalog, pg_toast and
  * information_schema and, for each of the forks main, vm and fsm, returns one
  * row (relname, fork, block number, page content) per block, obtained via
  * get_raw_page(). If the relation has any blocks and has a TOAST relation,
  * the same is done for 'pg_toast.pg_toast_<oid>' and its '_index'.
  */
 #define CREATE_COPYDATA_FUNC "\
 create or replace function copydata() returns setof record as $$ \
 declare \
    relsize integer; \
    total_relsize integer; \
 	content bytea; \
 	r record; \
 	fork text; \
 	relname text; \
 	pagesize integer; \
 begin \
 	pagesize = current_setting('block_size'); \
 	for r in select oid,reltoastrelid from pg_class where relnamespace not in (select oid from pg_namespace where nspname in ('pg_catalog','pg_toast','information_schema')) \
 	loop \
 		relname = r.oid::regclass::text; \
        total_relsize = 0; \
 	    foreach fork in array array['main','vm','fsm'] \
 		loop \
 		    relsize = pg_relation_size(r.oid, fork)/pagesize; \
 			total_relsize = total_relsize + relsize; \
 	        for p in 1..relsize \
 		    loop \
 			    content = get_raw_page(relname, fork, p-1); \
 				return next row(relname,fork,p-1,content); \
 			end loop; \
 		end loop; \
        if total_relsize <> 0 and r.reltoastrelid <> 0 then \
            foreach relname in array array ['pg_toast.pg_toast_'||r.oid, 'pg_toast.pg_toast_'||r.oid||'_index'] \
 			loop \
 		    	foreach fork in array array['main','vm','fsm'] \
 				loop \
 			    	relsize = pg_relation_size(relname, fork)/pagesize; \
 	        		for p in 1..relsize \
 		    		loop \
 			    		content = get_raw_page(relname, fork, p-1); \
 						return next row(relname,fork,p-1,content); \
 					end loop; \
 				end loop; \
 			end loop; \
        end if; \
 	end loop; \
 end; \
 $$ language plpgsql"
 /*
  * copy_from(conninfo cstring) returns bigint
  *
  * Connects to the source database described by conninfo, installs the
  * copydata() helper there, runs VACUUM FREEZE, and then streams every page
  * returned by copydata() through a cursor, writing each page into the
  * relation of the same name in the local database and WAL-logging it.
  * Returns the number of pages copied.
  *
  * Relations are matched by name. TOAST relations are remapped to the TOAST
  * relation name derived from the OID of the most recently opened local
  * non-TOAST relation.
  *
  * NOTE(review): errors raised after the connection is established still leave
  * the PGconn open; closing it on all error paths needs a PG_TRY wrapper.
  */
 Datum
 copy_from(PG_FUNCTION_ARGS)
 {
 	char const* conninfo = PG_GETARG_CSTRING(0);
 	PGconn* conn;
 	char const* declare_cursor = "declare copy_data_cursor no scroll cursor for select * from copydata() as raw_page(relid text, fork text, blkno integer, content bytea)";
 	char* fetch_cursor = psprintf("fetch forward %d copy_data_cursor", COPY_FETCH_COUNT);
 	char const* close_cursor = "close copy_data_cursor";
 	char const* vacuum_freeze = "vacuum freeze";
 	char   *content;
 	char const* relname;
 	BlockNumber blkno;
 	ForkNumber forknum;
 	BlockNumber prev_blkno = InvalidBlockNumber;
 	RangeVar   *relrv;
 	Relation rel = NULL;
 	BlockNumber rel_size = 0;
 	int64_t total = 0;
 	PGresult   *res;
 	int n_tuples;
 	Buffer buf;
 	Oid relid = InvalidOid;
 	/* Connect to the source database */
 	conn = PQconnectdb(conninfo);
 	if (!conn || PQstatus(conn) != CONNECTION_OK)
 	{
 		/* Copy the message first: it belongs to the connection we are about to free */
 		char	   *msg = pchomp(PQerrorMessage(conn));
 		PQfinish(conn);
 		ereport(ERROR,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 				 errmsg("could not connect to server \"%s\"",
 						conninfo),
 				 errdetail_internal("%s", msg)));
 	}
 	/* First create stored procedure (assumes that the pageinspect extension is already installed) */
 	res = PQexec(conn, CREATE_COPYDATA_FUNC);
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, CREATE_COPYDATA_FUNC);
 	PQclear(res);
 	/* Freeze all tables to prevent problems with XID mapping */
 	res = PQexec(conn, vacuum_freeze);
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, vacuum_freeze);
 	PQclear(res);
 	/* Start transaction to use cursor */
 	res = PQexec(conn, "BEGIN");
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, "BEGIN");
 	PQclear(res);
 	/* Declare cursor (we have to use a cursor to avoid materializing the whole database in memory) */
 	res = PQexec(conn, declare_cursor);
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, declare_cursor);
 	PQclear(res);
 	/* Get database data, COPY_FETCH_COUNT rows at a time, in binary format */
 	for (;;)
 	{
 		res = PQexecParams(conn, fetch_cursor, 0, NULL, NULL, NULL, NULL, 1);
 		/* A NULL result is a failed fetch, not the end of the data */
 		if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
 			report_error(ERROR, res, conn, true, fetch_cursor);
 		n_tuples = PQntuples(res);
 		if (PQnfields(res) != N_RAW_PAGE_COLUMNS)
 		{
 			PQclear(res);
 			elog(ERROR, "unexpected result from copydata()");
 		}
 		for (int i = 0; i < n_tuples; i++)
 		{
 			/* Validate field sizes before copying fixed-size chunks out of the result */
 			if (PQgetlength(res, i, 2) != (int) sizeof(BlockNumber) ||
 				PQgetlength(res, i, 3) != BLCKSZ)
 			{
 				PQclear(res);
 				elog(ERROR, "unexpected page format received from copydata()");
 			}
 			relname = PQgetvalue(res, i, 0);
 			forknum = forkname_to_number(PQgetvalue(res, i, 1));
 			memcpy(&blkno, PQgetvalue(res, i, 2), sizeof(BlockNumber));
 			blkno = pg_ntoh32(blkno);
 			content = (char*)PQgetvalue(res, i, 3);
 			/* Block numbers restart whenever a new fork (or relation) begins */
 			if (blkno <= prev_blkno)
 			{
 				if (forknum == MAIN_FORKNUM)
 				{
 					char* dst_rel_name = strncmp(relname, "pg_toast.", 9) == 0
 						/* Construct correct TOAST table name */
 						? psprintf("pg_toast.pg_toast_%u%s",
 								   relid,
 								   strcmp(relname + strlen(relname) - 5, "index") == 0 ? "_index" : "")
 						: (char*)relname;
 					if (rel)
 						relation_close(rel, AccessExclusiveLock);
 					relrv = makeRangeVarFromNameList(textToQualifiedNameList(cstring_to_text(dst_rel_name)));
 					rel = relation_openrv(relrv, AccessExclusiveLock);
 					if (dst_rel_name != relname)
 						pfree(dst_rel_name);
 					else
 						relid = RelationGetRelid(rel);	/* remembered for TOAST name mapping */
 				}
 				if (rel == NULL)
 				{
 					PQclear(res);
 					elog(ERROR, "copydata() returned a non-main fork before any main fork");
 				}
 				rel_size = RelationGetNumberOfBlocksInFork(rel, forknum);
 			}
 			/* Overwrite an existing block, or extend the relation when past its end */
 			buf = ReadBufferExtended(rel, forknum, blkno < rel_size ? blkno : P_NEW, RBM_ZERO_AND_LOCK, NULL);
 			MarkBufferDirty(buf);
 			memcpy(BufferGetPage(buf), content, BLCKSZ);
 			log_newpage_buffer(buf, forknum == MAIN_FORKNUM);
 			UnlockReleaseBuffer(buf);
 			total += 1;
 			prev_blkno = blkno;
 		}
 		PQclear(res);
 		/* A short batch means the cursor is exhausted */
 		if (n_tuples < COPY_FETCH_COUNT)
 			break;
 	}
 	res = PQexec(conn, close_cursor);
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, close_cursor);
 	PQclear(res);
 	if (rel)
 		relation_close(rel, AccessExclusiveLock);
 	/* Complete transaction */
 	res = PQexec(conn, "END");
 	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
 		report_error(ERROR, res, conn, true, "END");
 	PQclear(res);
 	PQfinish(conn);
 	PG_RETURN_INT64(total);
 }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -2675,7 +2675,6 @@ bool
 neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 {
 	XLogRecPtr	end_recptr = record->EndRecPtr;
 	XLogRecPtr	prev_end_recptr = record->ReadRecPtr - 1;
 	RelFileNode	rnode;
 	ForkNumber	forknum;
 	BlockNumber	blkno;
@@ -2719,16 +2718,15 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	no_redo_needed = buffer < 0;
-	/* we don't have the buffer in memory, update lwLsn past this record */
+	/* In both cases set lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
 	/* we don't have the buffer in memory, update lwLsn past this record,
 	 * also evict page from file cache
 	 */
 	if (no_redo_needed)
 	{
 		SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
 		lfc_evict(rnode, forknum, blkno);
-	}
+
 	else
 	{
 		SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
 	}
 	LWLockRelease(partitionLock);
@@ -2736,7 +2734,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	if (get_cached_relsize(rnode, forknum, &relsize))
 	{
 		if (relsize < blkno + 1)
 		{
 			update_cached_relsize(rnode, forknum, blkno + 1);
 			SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
 		}
 	}
 	else
 	{
@@ -2768,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		Assert(nbresponse->n_blocks > blkno);
 		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
 		SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
 		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
 	}
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -257,7 +257,7 @@ nwp_register_gucs(void)
 							"Walproposer reconnects to offline safekeepers once in this interval.",
 							NULL,
 							&wal_acceptor_reconnect_timeout,
-							5000, 0, INT_MAX,	/* default, min, max */
+							1000, 0, INT_MAX,	/* default, min, max */
 							PGC_SIGHUP, /* context */
 							GUC_UNIT_MS,	/* flags */
 							NULL, NULL, NULL);
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 [[package]]
 name = "aiohttp"
@@ -855,35 +855,31 @@ files = [
 [[package]]
 name = "cryptography"
-version = "39.0.1"
+version = "41.0.0"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
+    {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
+    {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
-    {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
-    {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
+    {file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
    {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
 ]
 [package.dependencies]
@@ -892,12 +888,12 @@ cffi = ">=1.12"
 [package.extras]
 docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
 docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
-pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
+nox = ["nox"]
-sdist = ["setuptools-rust (>=0.11.4)"]
+pep8test = ["black", "check-sdist", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
+test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
 tox = ["tox"]
 [[package]]
 name = "docker"
@@ -1658,71 +1654,74 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
 [[package]]
 name = "psycopg2-binary"
-version = "2.9.3"
+version = "2.9.6"
 description = "psycopg2 - Python-PostgreSQL Database Adapter"
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
+    {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"},
+    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"},
+    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"},
    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"},
    {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"},
    {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"},
 ]
 [[package]]
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
 atty.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -30,6 +29,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
 pbkdf2.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -38,6 +38,7 @@ rand.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
 routerify.workspace = true
 rustls-pemfile.workspace = true
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -93,6 +93,15 @@ With the current approach we made the following design decisions:
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
 ### Output options
 User can pass several optional headers that will affect resulting json.
 1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement its own parsing or reuse parsing libraries from e.g. node-postgres.
 2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge
 cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
 ## Using SNI-based routing on localhost
 Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -136,18 +136,17 @@ impl Default for ConnCfg {
 impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
-    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
+    async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
        use tokio_postgres::config::Host;
        // wrap TcpStream::connect with timeout
        let connect_with_timeout = |host, port| {
-            let connection_timeout = Duration::from_millis(10000);
+            tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
                move |res| match res {
                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
                    Err(_) => Err(io::Error::new(
                        io::ErrorKind::TimedOut,
-                        format!("exceeded connection timeout {connection_timeout:?}"),
+                        format!("exceeded connection timeout {timeout:?}"),
                    )),
                },
            )
@@ -223,8 +222,9 @@ impl ConnCfg {
    async fn do_connect(
        &self,
        allow_self_signed_compute: bool,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
-        let (socket_addr, stream, host) = self.connect_raw().await?;
+        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
        let tls_connector = native_tls::TlsConnector::builder()
            .danger_accept_invalid_certs(allow_self_signed_compute)
@@ -264,8 +264,9 @@ impl ConnCfg {
    pub async fn connect(
        &self,
        allow_self_signed_compute: bool,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
-        self.do_connect(allow_self_signed_compute)
+        self.do_connect(allow_self_signed_compute, timeout)
            .inspect_err(|err| {
                // Immediately log the error we have at our disposal.
                error!("couldn't connect to compute node: {err}");
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -212,7 +212,7 @@ pub struct CacheOptions {
 impl CacheOptions {
    /// Default options for [`crate::auth::caches::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=5m";
+    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
    /// Parse cache options passed via cmdline.
    /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -8,7 +8,7 @@ use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, Quer
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
 use std::future;
 use tokio::net::{TcpListener, TcpStream};
-use tracing::{error, info, info_span};
+use tracing::{error, info, info_span, Instrument};
 static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
@@ -44,19 +44,30 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> {
            .set_nodelay(true)
            .context("failed to set client socket option")?;
-        tokio::task::spawn(async move {
+        let span = info_span!("mgmt", peer = %peer_addr);
            let span = info_span!("mgmt", peer = %peer_addr);
            let _enter = span.enter();
-            info!("started a new console management API thread");
+        tokio::task::spawn(
-            scopeguard::defer! {
+            async move {
-                info!("console management API thread is about to finish");
+                info!("serving a new console management API connection");
            }
-            if let Err(e) = handle_connection(socket).await {
+                // these might be long running connections, have a separate logging for cancelling
-                error!("thread failed with an error: {e}");
+                // on shutdown and other ways of stopping.
                let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
                    let _e = span.entered();
                    info!("console management API task cancelled");
                });
                if let Err(e) = handle_connection(socket).await {
                    error!("serving failed with an error: {e}");
                } else {
                    info!("serving completed");
                }
                // we can no longer get dropped
                scopeguard::ScopeGuard::into_inner(cancelled);
            }
-        });
+            .instrument(span),
        );
    }
 }
@@ -77,14 +88,14 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
        pgb: &mut PostgresBackendTCP,
        query: &str,
    ) -> Result<(), QueryError> {
-        try_process_query(pgb, query).await.map_err(|e| {
+        try_process_query(pgb, query).map_err(|e| {
            error!("failed to process response: {e:?}");
            e
        })
    }
 }
-async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
+fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
    let span = info_span!("event", session_id = resp.session_id);
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -2,12 +2,16 @@
 //! Other modules should use stuff from this module instead of
 //! directly relying on deps like `reqwest` (think loose coupling).
 pub mod conn_pool;
 pub mod server;
 pub mod sql_over_http;
 pub mod websocket;
 use std::time::Duration;
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -21,6 +25,24 @@ pub fn new_client() -> ClientWithMiddleware {
        .build()
 }
 /// Builds an HTTP client with tracing and transient-error retries, where
 /// `default_timout` is used both as the `reqwest` request timeout and as the
 /// total retry duration of the exponential backoff policy.
 ///
 /// # Panics
 /// Panics if the underlying `reqwest` client cannot be built.
 pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
    let backoff = ExponentialBackoff::builder().build_with_total_retry_duration(default_timout);
    // As per docs, "This middleware always errors when given requests with streaming bodies".
    // That's all right because we only use this client to send `serde_json::RawValue`, which
    // is not a stream.
    let retry_middleware = RetryTransientMiddleware::new_with_policy(backoff);
    let inner = reqwest::ClientBuilder::new()
        .timeout(default_timout)
        .build()
        .expect("Failed to create http client with timeout");
    // Middleware order matters: tracing is registered before retries.
    let with_tracing = reqwest_middleware::ClientBuilder::new(inner)
        .with(reqwest_tracing::TracingMiddleware::default());
    with_tracing.with(retry_middleware).build()
 }
 /// Thin convenience wrapper for an API provided by an http endpoint.
 #[derive(Debug, Clone)]
 pub struct Endpoint {
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -0,0 +1,278 @@
 use parking_lot::Mutex;
 use pq_proto::StartupMessageParams;
 use std::fmt;
 use std::{collections::HashMap, sync::Arc};
 use futures::TryFutureExt;
 use crate::config;
 use crate::{auth, console};
 use super::sql_over_http::MAX_RESPONSE_SIZE;
 use crate::proxy::invalidate_cache;
 use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
 use tracing::error;
 use tracing::info;
 pub const APP_NAME: &str = "sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
 /// Parameters describing one client connection: role, database, endpoint
 /// hostname and password.
 ///
 /// NOTE(review): `Debug` is derived, so `{:?}` prints `password` as well; the
 /// hand-written `Display` impl for this type leaves it out. Prefer `{}` in logs.
 #[derive(Debug)]
 pub struct ConnInfo {
    /// Role name; passed as the `user` of the compute connection.
    pub username: String,
    /// Database name; passed as the `dbname` of the compute connection.
    pub dbname: String,
    /// Endpoint hostname; `GlobalConnPool` uses it as the key of its per-endpoint map.
    pub hostname: String,
    /// Password passed to the compute connection.
    pub password: String,
 }
 impl ConnInfo {
    /// Returns freshly cloned `(dbname, username)`, in that order.
    ///
    /// The pair is the key type of the per-endpoint pool map; switching to a
    /// hasher could avoid the two clones.
    pub fn db_and_user(&self) -> (String, String) {
        let Self {
            dbname, username, ..
        } = self;
        (dbname.to_owned(), username.to_owned())
    }
 }
 impl fmt::Display for ConnInfo {
    /// Writes `username@hostname/dbname`.
    ///
    /// The password is deliberately not part of the output so that the value
    /// can be used in log messages.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            username,
            hostname,
            dbname,
            ..
        } = self;
        f.write_fmt(format_args!("{}@{}/{}", username, hostname, dbname))
    }
 }
 /// One idle connection stored in a per-endpoint pool.
 struct ConnPoolEntry {
    /// The client handle handed back out by `GlobalConnPool::get`.
    conn: tokio_postgres::Client,
    /// Set to `Instant::now()` when the entry is created in `GlobalConnPool::put`.
    /// Not read by the code visible in this file (hence the leading underscore).
    _last_access: std::time::Instant,
 }
 // Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
 // Number of open connections is limited by the `max_conns_per_endpoint`
 // (a field of `GlobalConnPool`, checked in `GlobalConnPool::put`).
 pub struct EndpointConnPool {
    // Idle connections grouped by the `(dbname, username)` pair returned by
    // `ConnInfo::db_and_user`.
    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
    // Count of pooled entries across all vectors in `pools`; incremented when
    // a connection is put back and decremented when one is taken out.
    total_conns: usize,
 }
 /// Process-wide registry of per-endpoint connection pools.
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
    // That should be a fairly contended map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
    // When running out of free slots for a particular endpoint,
    // falls back to opening a new connection for each request.
    max_conns_per_endpoint: usize,
    // Proxy configuration handed to `connect_to_compute` whenever a new
    // connection has to be opened.
    proxy_config: &'static crate::config::ProxyConfig,
 }
 impl GlobalConnPool {
    /// Creates an empty pool registry that opens new connections using `config`
    /// and caps every endpoint at `MAX_CONNS_PER_ENDPOINT` pooled connections.
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
        Arc::new(Self {
            global_pool: Mutex::new(HashMap::new()),
            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
        })
    }
    /// Returns a client for `conn_info`.
    ///
    /// Unless `force_new` is set, the most recently pooled connection for the
    /// same `(dbname, username)` on this endpoint is taken out of the pool and
    /// reused if it is still open. In every other case (nothing cached, the
    /// cached connection is closed, or `force_new`) a new connection is opened.
    ///
    /// # Errors
    /// Returns the error of `connect_to_compute` when a new connection has to
    /// be opened and that fails.
    pub async fn get(
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
    ) -> anyhow::Result<tokio_postgres::Client> {
        let cached = if force_new {
            None
        } else {
            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
            // The guard lives only inside this block, so the lock is released
            // before any of the `.await`s below.
            let mut pool = pool.lock();
            // find a pool entry by (dbname, username) if exists
            let entry = pool
                .pools
                .get_mut(&conn_info.db_and_user())
                .and_then(|entries| entries.pop());
            if entry.is_some() {
                pool.total_conns -= 1;
            }
            entry.map(|entry| entry.conn)
        };
        // ok return cached connection if found and establish a new one otherwise
        match cached {
            Some(client) if !client.is_closed() => {
                info!("pool: reusing connection '{conn_info}'");
                Ok(client)
            }
            Some(_) => {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
                connect_to_compute(self.proxy_config, conn_info).await
            }
            None => {
                info!("pool: opening a new connection '{conn_info}'");
                connect_to_compute(self.proxy_config, conn_info).await
            }
        }
    }
    /// Hands `client` back to the pool of `conn_info`'s endpoint.
    ///
    /// The connection is dropped instead of being pooled when it is already
    /// closed or when the endpoint already holds `max_conns_per_endpoint`
    /// pooled connections. Always returns `Ok(())`.
    pub async fn put(
        &self,
        conn_info: &ConnInfo,
        client: tokio_postgres::Client,
    ) -> anyhow::Result<()> {
        // A closed connection cannot be reused; pooling it would only take up
        // a slot until `get` pops and discards it.
        if client.is_closed() {
            info!("pool: throwing away connection '{conn_info}' because connection is closed");
            return Ok(());
        }
        let pool = self.get_endpoint_pool(&conn_info.hostname).await;
        // return connection to the pool
        let (returned, total_conns, per_db_size) = {
            let mut pool = pool.lock();
            if pool.total_conns < self.max_conns_per_endpoint {
                // Only touch the map when there is room, so that a full pool
                // does not accumulate empty per-(db, user) vectors.
                let pool_entries = pool
                    .pools
                    .entry(conn_info.db_and_user())
                    .or_insert_with(|| Vec::with_capacity(1));
                pool_entries.push(ConnPoolEntry {
                    conn: client,
                    _last_access: std::time::Instant::now(),
                });
                let per_db_size = pool_entries.len();
                pool.total_conns += 1;
                (true, pool.total_conns, per_db_size)
            } else {
                (false, pool.total_conns, 0)
            }
        };
        // do logging outside of the mutex
        if returned {
            info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
        } else {
            info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
        }
        Ok(())
    }
    /// Finds the pool for `endpoint`, creating and registering an empty one if
    /// none exists yet.
    ///
    /// The global map is locked only for the lookup/insert; the returned `Arc`
    /// lets callers work on the per-endpoint pool without holding that lock.
    async fn get_endpoint_pool(&self, endpoint: &str) -> Arc<Mutex<EndpointConnPool>> {
        let (pool, global_pool_size) = {
            let mut global_pool = self.global_pool.lock();
            // Common case: the pool exists, so no owned key has to be allocated.
            if let Some(pool) = global_pool.get(endpoint) {
                return Arc::clone(pool);
            }
            let pool = Arc::new(Mutex::new(EndpointConnPool {
                pools: HashMap::new(),
                total_conns: 0,
            }));
            global_pool.insert(endpoint.to_owned(), Arc::clone(&pool));
            (pool, global_pool.len())
        };
        // log new global pool size (outside of the global lock)
        info!("pool: created new pool for '{endpoint}', global pool size now {global_pool_size}");
        pool
    }
 }
 //
 // Wake up the destination if needed. Code here is a bit involved because
 // we reuse the code from the usual proxy and we need to prepare few structures
 // that this code expects.
 //
 // Returns an error (instead of panicking) when `wake_compute` yields no node
 // info. After a failed connection attempt the node is woken up again and the
 // connection is retried, up to `NUM_RETRIES_WAKE_COMPUTE` times.
 //
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
 ) -> anyhow::Result<tokio_postgres::Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());
    let credential_params = StartupMessageParams::new([
        ("user", &conn_info.username),
        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
    ]);
    let creds = config
        .auth_backend
        .as_ref()
        .map(|_| {
            auth::ClientCredentials::parse(
                &credential_params,
                Some(&conn_info.hostname),
                common_names,
            )
        })
        .transpose()?;
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
    };
    // `wake_compute` can legitimately return `None` (the retry loop below
    // handles the same case for link auth), so report it as an error rather
    // than panicking inside a request handler.
    let mut node_info = creds
        .wake_compute(&extra)
        .await?
        .ok_or_else(|| anyhow::anyhow!("wake_compute returned no compute node info"))?;
    // This code is a copy of `connect_to_compute` from `src/proxy.rs` with
    // the difference that it uses `tokio_postgres` for the connection.
    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
    loop {
        match connect_to_compute_once(&node_info, conn_info).await {
            Err(e) if num_retries > 0 => {
                info!("compute node's state has changed; requesting a wake-up");
                match creds.wake_compute(&extra).await? {
                    // Update `node_info` and try one more time.
                    Some(new) => {
                        node_info = new;
                    }
                    // Link auth doesn't work that way, so we just exit.
                    None => return Err(e),
                }
            }
            // Success, or a failure with no retries left.
            other => return other,
        }
        num_retries -= 1;
        info!("retrying after wake-up ({num_retries} attempts left)");
    }
 }
 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
 ) -> anyhow::Result<tokio_postgres::Client> {
    let mut config = (*node_info.config).clone();
    let (client, connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
        .max_backend_message_size(MAX_RESPONSE_SIZE)
        .connect(tokio_postgres::NoTls)
        .inspect_err(|e: &tokio_postgres::Error| {
            error!(
                "failed to connect to compute node hosts={:?} ports={:?}: {}",
                node_info.config.get_hosts(),
                node_info.config.get_ports(),
                e
            );
            invalidate_cache(node_info)
        })
        .await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            error!("connection error: {}", e);
        }
    });
    Ok(client)
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,8 +1,11 @@
 use std::sync::Arc;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
 use pq_proto::StartupMessageParams;
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
@@ -11,7 +14,8 @@ use tokio_postgres::types::Type;
 use tokio_postgres::Row;
 use url::Url;
-use crate::{auth, config::ProxyConfig, console};
+use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -19,25 +23,33 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }
-const APP_NAME: &str = "sql_over_http";
+pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
    json.iter()
        .map(|value| {
            match value {
-                Value::Null => serde_json::to_string(value),
+                // special care for nulls
-                Value::Bool(_) => serde_json::to_string(value),
+                Value::Null => Ok(None),
                Value::Number(_) => serde_json::to_string(value),
                Value::Object(_) => serde_json::to_string(value),
-                // no need to escape
+                // convert to text with escaping
-                Value::String(s) => Ok(s.to_string()),
+                Value::Bool(_) => serde_json::to_string(value).map(Some),
                Value::Number(_) => serde_json::to_string(value).map(Some),
                Value::Object(_) => serde_json::to_string(value).map(Some),
                // avoid escaping here, as we pass this as a parameter
                Value::String(s) => Ok(Some(s.to_string())),
                // special care for arrays
                Value::Array(_) => json_array_to_pg_array(value),
@@ -54,25 +66,29 @@ fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
 //
 // Example of the same escaping in node-postgres: packages/pg/lib/utils.js
 //
-fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
+fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
    match value {
-        // same
+        // special care for nulls
-        Value::Null => serde_json::to_string(value),
+        Value::Null => Ok(None),
        Value::Bool(_) => serde_json::to_string(value),
        Value::Number(_) => serde_json::to_string(value),
        Value::Object(_) => serde_json::to_string(value),
-        // now needs to be escaped, as it is part of the array
+        // convert to text with escaping
-        Value::String(_) => serde_json::to_string(value),
+        Value::Bool(_) => serde_json::to_string(value).map(Some),
        Value::Number(_) => serde_json::to_string(value).map(Some),
        Value::Object(_) => serde_json::to_string(value).map(Some),
        // here string needs to be escaped, as it is part of the array
        Value::String(_) => serde_json::to_string(value).map(Some),
        // recurse into array
        Value::Array(arr) => {
            let vals = arr
                .iter()
                .map(json_array_to_pg_array)
                .map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
                .collect::<Result<Vec<_>, _>>()?
                .join(",");
-            Ok(format!("{{{}}}", vals))
+
            Ok(Some(format!("{{{}}}", vals)))
        }
    }
 }
@@ -80,7 +96,7 @@ fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
 fn get_conn_info(
    headers: &HeaderMap,
    sni_hostname: Option<String>,
-) -> Result<(String, String, String, String), anyhow::Error> {
+) -> Result<ConnInfo, anyhow::Error> {
    let connection_string = headers
        .get("Neon-Connection-String")
        .ok_or(anyhow::anyhow!("missing connection string"))?
@@ -133,56 +149,33 @@ fn get_conn_info(
        }
    }
-    Ok((
+    Ok(ConnInfo {
-        username.to_owned(),
+        username: username.to_owned(),
-        dbname.to_owned(),
+        dbname: dbname.to_owned(),
-        hostname.to_owned(),
+        hostname: hostname.to_owned(),
-        password.to_owned(),
+        password: password.to_owned(),
-    ))
+    })
 }
 // TODO: return different http error codes
 pub async fn handle(
    config: &'static ProxyConfig,
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
 ) -> anyhow::Result<Value> {
    //
    // Determine the destination and connection params
    //
    let headers = request.headers();
-    let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
+    let conn_info = get_conn_info(headers, sni_hostname)?;
    let credential_params = StartupMessageParams::new([
        ("user", &username),
        ("database", &dbname),
        ("application_name", APP_NAME),
    ]);
-    //
+    // Determine the output options. Default behaviour is 'false'. Anything that is not
-    // Wake up the destination if needed. Code here is a bit involved because
+    // strictly 'true' assumed to be false.
-    // we reuse the code from the usual proxy and we need to prepare few structures
+    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-    // that this code expects.
+    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
-    //
+
-    let tls = config.tls_config.as_ref();
+    // Allow connection pooling only if explicitly requested
-    let common_names = tls.and_then(|tls| tls.common_names.clone());
+    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
    let creds = config
        .auth_backend
        .as_ref()
        .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
        .transpose()?;
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
    };
    let node = creds.wake_compute(&extra).await?.expect("msg");
    let conf = node.value.config;
    let port = *conf.get_ports().first().expect("no port");
    let host = match conf.get_hosts().first().expect("no host") {
        tokio_postgres::config::Host::Tcp(host) => host,
        tokio_postgres::config::Host::Unix(_) => {
            return Err(anyhow::anyhow!("unix socket is not supported"));
        }
    };
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -202,28 +195,11 @@ pub async fn handle(
    let QueryData { query, params } = serde_json::from_slice(&body)?;
    let query_params = json_to_pg_text(params)?;
    //
    // Connenct to the destination
    //
    let (client, connection) = tokio_postgres::Config::new()
        .host(host)
        .port(port)
        .user(&username)
        .password(&password)
        .dbname(&dbname)
        .max_backend_message_size(MAX_RESPONSE_SIZE)
        .connect(tokio_postgres::NoTls)
        .await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {}", e);
        }
    });
    //
    // Now execute the query and return the result
    //
    let client = conn_pool.get(&conn_info, !allow_pool).await?;
    let row_stream = client.query_raw_txt(query, query_params).await?;
    // Manually drain the stream into a vector to leave row_stream hanging
@@ -262,6 +238,11 @@ pub async fn handle(
                json!({
                    "name": Value::String(c.name().to_owned()),
                    "dataTypeID": Value::Number(c.type_().oid().into()),
                    "tableID": c.table_oid(),
                    "columnID": c.column_id(),
                    "dataTypeSize": c.type_size(),
                    "dataTypeModifier": c.type_modifier(),
                    "format": "text",
                })
            })
            .collect::<Vec<_>>()
@@ -272,35 +253,58 @@ pub async fn handle(
    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(pg_text_row_to_json)
+        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;
    if allow_pool {
        // return connection to the pool
        tokio::task::spawn(async move {
            let _ = conn_pool.put(&conn_info, client).await;
        });
    }
    // resulting JSON format is based on the format of node-postgres result
    Ok(json!({
        "command": command_tag_name,
        "rowCount": command_tag_count,
        "rows": rows,
        "fields": fields,
        "rowAsArray": array_mode,
    }))
 }
 //
 // Convert postgres row with text-encoded values to JSON object
 //
-pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
+pub fn pg_text_row_to_json(
-    let res = row
+    row: &Row,
-        .columns()
+    raw_output: bool,
-        .iter()
+    array_mode: bool,
-        .enumerate()
+) -> Result<Value, anyhow::Error> {
-        .map(|(i, column)| {
+    let iter = row.columns().iter().enumerate().map(|(i, column)| {
-            let name = column.name();
+        let name = column.name();
-            let pg_value = row.as_text(i)?;
+        let pg_value = row.as_text(i)?;
-            let json_value = pg_text_to_json(pg_value, column.type_())?;
+        let json_value = if raw_output {
-            Ok((name.to_string(), json_value))
+            match pg_value {
-        })
+                Some(v) => Value::String(v.to_string()),
-        .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+                None => Value::Null,
            }
        } else {
            pg_text_to_json(pg_value, column.type_())?
        };
        Ok((name.to_string(), json_value))
    });
-    Ok(Value::Object(res))
+    if array_mode {
        // drop keys and aggregate into array
        let arr = iter
            .map(|r| r.map(|(_key, val)| val))
            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
        Ok(Value::Array(arr))
    } else {
        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
        Ok(Value::Object(obj))
    }
 }
 //
@@ -308,10 +312,6 @@ pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
 //
 pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
        if val == "NULL" {
            return Ok(Value::Null);
        }
        if let Kind::Array(elem_type) = pg_type.kind() {
            return pg_array_parse(val, elem_type);
        }
@@ -373,6 +373,27 @@ fn _pg_array_parse(
        }
    }
    /// Converts a pending unquoted array element and appends it to `entries`.
    ///
    /// An empty `entry` is ignored. Otherwise the text is converted with
    /// `pg_text_to_json` and `entry` is cleared afterwards; if the conversion
    /// fails the error is returned and `entry` is left as it was.
    fn push_checked(
        entry: &mut String,
        entries: &mut Vec<Value>,
        elem_type: &Type,
    ) -> Result<(), anyhow::Error> {
        if entry.is_empty() {
            return Ok(());
        }
        // In a regular postgres response nulls arrive as None and everything
        // else as Some(&str). Inside arrays a null is the unquoted text NULL,
        // whereas the string "NULL" arrives quoted and never reaches this
        // function. So map the bare NULL to None here, where the quoting
        // information is still known.
        let text = if entry.as_str() == "NULL" {
            None
        } else {
            Some(entry.as_str())
        };
        let converted = pg_text_to_json(text, elem_type)?;
        entries.push(converted);
        entry.clear();
        Ok(())
    }
    while let Some((mut i, mut c)) = pg_array_chr.next() {
        let mut escaped = false;
@@ -395,9 +416,7 @@ fn _pg_array_parse(
            '}' => {
                level -= 1;
                if level == 0 {
-                    if !entry.is_empty() {
+                    push_checked(&mut entry, &mut entries, elem_type)?;
                        entries.push(pg_text_to_json(Some(&entry), elem_type)?);
                    }
                    if nested {
                        return Ok((Value::Array(entries), i));
                    }
@@ -405,17 +424,15 @@ fn _pg_array_parse(
            }
            '"' if !escaped => {
                if quote {
-                    // push even if empty
+                    // end of quoted string, so push it manually without any checks
                    // for emptiness or nulls
                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
+                    entry.clear();
                }
                quote = !quote;
            }
            ',' if !quote => {
-                if !entry.is_empty() {
+                push_checked(&mut entry, &mut entries, elem_type)?;
                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
                    entry = String::new();
                }
            }
            _ => {
                entry.push(c);
@@ -439,30 +456,35 @@ mod tests {
    fn test_atomic_types_to_pg_params() {
        let json = vec![Value::Bool(true), Value::Bool(false)];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["true", "false"]);
+        assert_eq!(
            pg_params,
            vec![Some("true".to_owned()), Some("false".to_owned())]
        );
        let json = vec![Value::Number(serde_json::Number::from(42))];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["42"]);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);
        let json = vec![Value::String("foo\"".to_string())];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["foo\""]);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
        let json = vec![Value::Null];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["null"]);
+        assert_eq!(pg_params, vec![None]);
    }
    #[test]
    fn test_json_array_to_pg_array() {
        // atoms and escaping
-        let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
        let json: Value = serde_json::from_str(json).unwrap();
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
+            vec![Some(
                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
            )]
        );
        // nested arrays
@@ -471,7 +493,9 @@ mod tests {
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
+            vec![Some(
                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
            )]
        );
    }
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -26,7 +26,6 @@ use tls_listener::TlsListener;
 use tokio::{
    io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
    net::TcpListener,
    select,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -36,7 +35,7 @@ use utils::http::{error::ApiError, json::json_response};
 // Tracking issue: https://github.com/rust-lang/rust/issues/98407.
 use sync_wrapper::SyncWrapper;
-use super::sql_over_http;
+use super::{conn_pool::GlobalConnPool, sql_over_http};
 pin_project! {
    /// This is a wrapper around a [`WebSocketStream`] that
@@ -165,6 +164,7 @@ async fn serve_websocket(
 async fn ws_handler(
    mut request: Request<Body>,
    config: &'static ProxyConfig,
    conn_pool: Arc<GlobalConnPool>,
    cancel_map: Arc<CancelMap>,
    session_id: uuid::Uuid,
    sni_hostname: Option<String>,
@@ -193,14 +193,9 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = select! {
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
-            _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
+            .instrument(info_span!("sql-over-http"))
-                Err(anyhow::anyhow!("Query timed out"))
+            .await;
            }
            response = sql_over_http::handle(config, request, sni_hostname) => {
                response
            }
        };
        let status_code = match result {
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
@@ -240,6 +235,8 @@ pub async fn task_main(
        info!("websocket server has shut down");
    }
    let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
    let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
    let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
        Some(config) => config.into(),
@@ -264,15 +261,18 @@ pub async fn task_main(
    let make_svc =
        hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
            let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
            let conn_pool = conn_pool.clone();
            async move {
                Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
                    let sni_name = sni_name.clone();
                    let conn_pool = conn_pool.clone();
                    async move {
                        let cancel_map = Arc::new(CancelMap::default());
                        let session_id = uuid::Uuid::new_v4();
-                        ws_handler(req, config, cancel_map, session_id, sni_name)
+                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                            .instrument(info_span!(
                                "ws-client",
                                session = format_args!("{session_id}")
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -18,7 +18,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
        .from_env_lossy();
    let fmt_layer = tracing_subscriber::fmt::layer()
-        .with_ansi(atty::is(atty::Stream::Stderr))
+        .with_ansi(false)
        .with_writer(std::io::stderr)
        .with_target(false);
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,11 +4,13 @@ use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 use tracing::{error, info, instrument, trace, warn};
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 ///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
@@ -30,7 +32,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
        info!("metrics collector has shut down");
    }
-    let http_client = http::new_client();
+    let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
    let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
@@ -182,36 +184,36 @@ async fn collect_metrics_iteration(
            }
        };
-        if res.status().is_success() {
+        if !res.status().is_success() {
            // update cached metrics after they were sent successfully
            for send_metric in chunk {
                let stop_time = match send_metric.kind {
                    EventType::Incremental { stop_time, .. } => stop_time,
                    _ => unreachable!(),
                };
                cached_metrics
                    .entry(Ids {
                        endpoint_id: send_metric.extra.endpoint_id.clone(),
                        branch_id: send_metric.extra.branch_id.clone(),
                    })
                    // update cached value (add delta) and time
                    .and_modify(|e| {
                        e.0 = e.0.saturating_add(send_metric.value);
                        e.1 = stop_time
                    })
                    // cache new metric
                    .or_insert((send_metric.value, stop_time));
            }
        } else {
            error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter() {
+            for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
                // Report if the metric value is suspiciously large
-                if metric.value > (1u64 << 40) {
+                error!("potentially abnormal metric value: {:?}", metric);
                    error!("potentially abnormal metric value: {:?}", metric);
                }
            }
        }
        // update cached metrics after they were sent
        // (to avoid sending the same metrics twice)
        // see the relevant discussion on why to do so even if the status is not success:
        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
        for send_metric in chunk {
            let stop_time = match send_metric.kind {
                EventType::Incremental { stop_time, .. } => stop_time,
                _ => unreachable!(),
            };
            cached_metrics
                .entry(Ids {
                    endpoint_id: send_metric.extra.endpoint_id.clone(),
                    branch_id: send_metric.extra.branch_id.clone(),
                })
                // update cached value (add delta) and time
                .and_modify(|e| {
                    e.0 = e.0.saturating_add(send_metric.value);
                    e.1 = stop_time
                })
                // cache new metric
                .or_insert((send_metric.value, stop_time));
        }
    }
    Ok(())
 }
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -16,13 +16,16 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
    time,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::measured_stream::MeasuredStream;
 /// Number of times we should retry the `/proxy_wake_compute` http request.
-const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
+pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -283,34 +286,36 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    }
 }
 /// Records a failed connection attempt to a compute node and drops the
 /// possibly stale cache entry behind it.
 ///
 /// A failed connect may be caused by outdated cached connection info (e.g. the
 /// compute node's address might've changed at the wrong time), so a cached
 /// entry is invalidated to prevent subsequent errors. In either case the
 /// failure is counted in `NUM_CONNECTION_FAILURES`, labelled by whether the
 /// node info came from the cache.
 #[tracing::instrument(name = "invalidate_cache", skip_all)]
 pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
    // The cached-ness is sampled once, before the entry gets invalidated, so
    // the metric label reflects the state the connection attempt was made with.
    let failure_label = if node_info.cached() {
        warn!("invalidating stalled compute node info cache entry");
        node_info.invalidate();
        "compute_cached"
    } else {
        "compute_uncached"
    };
    NUM_CONNECTION_FAILURES
        .with_label_values(&[failure_label])
        .inc();
 }
 /// Try to connect to the compute node once.
 #[tracing::instrument(name = "connect_once", skip_all)]
 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    timeout: time::Duration,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
    // If we couldn't connect, a cached connection info might be to blame
    // (e.g. the compute node's address might've changed at the wrong time).
    // Invalidate the cache entry (if any) to prevent subsequent errors.
    let invalidate_cache = |_: &compute::ConnectionError| {
        let is_cached = node_info.cached();
        if is_cached {
            warn!("invalidating stalled compute node info cache entry");
            node_info.invalidate();
        }
        let label = match is_cached {
            true => "compute_cached",
            false => "compute_uncached",
        };
        NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
    };
    let allow_self_signed_compute = node_info.allow_self_signed_compute;
    node_info
        .config
-        .connect(allow_self_signed_compute)
+        .connect(allow_self_signed_compute, timeout)
-        .inspect_err(invalidate_cache)
+        .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
        .await
 }
@@ -327,7 +332,27 @@ async fn connect_to_compute(
    loop {
        // Apply startup params to the (possibly, cached) compute node info.
        node_info.config.set_startup_params(params);
-        match connect_to_compute_once(node_info).await {
+
        // Set a shorter timeout for the initial connection attempt.
        //
        // In case we try to connect to an outdated address that is no longer valid, the
        // default behavior of Kubernetes is to drop the packets, causing us to wait for
        // the entire timeout period. We want to fail fast in such cases.
        //
        // A specific case to consider is when we have cached compute node information
        // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
        // call, resulting in the nonexistence of the compute node.
        //
        // We only use caching in case of scram proxy backed by the console, so reduce
        // the timeout only in that case.
        let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
        let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE {
            time::Duration::from_secs(2)
        } else {
            time::Duration::from_secs(10)
        };
        match connect_to_compute_once(node_info, timeout).await {
            Err(e) if num_retries > 0 => {
                info!("compute node's state has changed; requesting a wake-up");
                match creds.wake_compute(extra).map_err(io_error).await? {
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -45,17 +45,74 @@ fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8
    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
    parts.into_iter().for_each(|s| mac.update(s));
-    // TODO: maybe newer `hmac` et al already migrated to regular arrays?
+    mac.finalize().into_bytes().into()
    let mut result = [0u8; 32];
    result.copy_from_slice(mac.finalize().into_bytes().as_slice());
    result
 }
 fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
    let mut hasher = Sha256::new();
    parts.into_iter().for_each(|s| hasher.update(s));
-    let mut result = [0u8; 32];
+    hasher.finalize().into()
-    result.copy_from_slice(hasher.finalize().as_slice());
+}
-    result
+
 #[cfg(test)]
 mod tests {
    use crate::sasl::{Mechanism, Step};
    use super::{password::SaltedPassword, Exchange, ServerSecret};

    /// Drives one complete exchange with fixed inputs (password, salt, nonce)
    /// and checks both server replies as well as the key produced on success.
    #[test]
    fn happy_path() {
        // Fixed nonce bytes, so that the server messages are reproducible.
        const NONCE: [u8; 18] = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        ];

        // Messages sent by the client and the replies expected from the server.
        let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO";
        let server_first =
            "r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096";
        let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0=";
        let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0=";

        // Server-side secret derived from the password "pencil".
        let iterations = 4096;
        let salt_base64 = "QSXCR+Q6sek8bf92";
        let salt = base64::decode(salt_base64).unwrap();
        let salted_password = SaltedPassword::new(b"pencil", salt.as_slice(), iterations);
        let secret = ServerSecret {
            iterations,
            salt_base64: salt_base64.to_owned(),
            stored_key: salted_password.client_key().sha256(),
            server_key: salted_password.server_key(),
            doomed: false,
        };

        let mut mechanism = Exchange::new(&secret, || NONCE, None);

        // First round trip: the exchange must continue with the server-first message.
        mechanism = match mechanism.exchange(client_first).unwrap() {
            Step::Continue(next, reply) => {
                assert_eq!(reply, server_first);
                next
            }
            Step::Success(..) => panic!("expected continue, got success"),
            Step::Failure(f) => panic!("{f}"),
        };

        // Second round trip: the exchange must succeed with the server-final message.
        let key = match mechanism.exchange(client_final).unwrap() {
            Step::Success(derived, reply) => {
                assert_eq!(reply, server_final);
                derived
            }
            Step::Continue(..) => panic!("expected success, got continue"),
            Step::Failure(f) => panic!("{f}"),
        };

        assert_eq!(
            key.as_bytes(),
            [
                74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134,
                27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112,
            ]
        );
    }
 }
--- a/proxy/src/scram/password.rs
+++ b/proxy/src/scram/password.rs
@@ -14,19 +14,7 @@ impl SaltedPassword {
    /// See `scram-common.c : scram_SaltedPassword` for details.
    /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
    pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
-        let one = 1_u32.to_be_bytes(); // magic
+        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
        let mut current = super::hmac_sha256(password, [salt, &one]);
        let mut result = current;
        for _ in 1..iterations {
            current = super::hmac_sha256(password, [current.as_ref()]);
            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
            for (i, x) in current.iter().enumerate() {
                result[i] ^= x;
            }
        }
        result.into()
    }
    /// Derive `ClientKey` from a salted hashed password.
@@ -46,3 +34,41 @@ impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
        Self { bytes }
    }
 }
 #[cfg(test)]
 mod tests {
    use super::SaltedPassword;

    /// Hand-written key derivation built on `hmac_sha256`, kept only as a
    /// reference implementation to compare `SaltedPassword::new` against.
    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
        // Big-endian `1` that is fed to the first HMAC round right after the salt.
        let one_be = 1_u32.to_be_bytes();
        let mut round = super::super::hmac_sha256(password, [salt, &one_be]);
        let mut folded = round;
        // Each further round hashes the previous round's output and is XOR-ed
        // into the accumulated result.
        for _ in 1..iterations {
            round = super::super::hmac_sha256(password, [round.as_ref()]);
            folded
                .iter_mut()
                .zip(round.iter())
                .for_each(|(acc, byte)| *acc ^= byte);
        }
        folded.into()
    }

    /// `SaltedPassword::new` must reproduce both a pinned output and the
    /// result of the legacy implementation for the same inputs.
    #[test]
    fn pbkdf2() {
        let iterations = 4096;
        let (password, salt) = (
            "a-very-secure-password".as_bytes(),
            "such-a-random-salt".as_bytes(),
        );
        let output = [
            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
        ];

        let actual = SaltedPassword::new(password, salt, iterations);
        let expected = legacy_pbkdf2_impl(password, salt, iterations);

        assert_eq!(actual.bytes, output);
        assert_eq!(actual.bytes, expected.bytes);
    }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ authors = []
 [tool.poetry.dependencies]
 python = "^3.9"
 pytest = "^7.3.1"
-psycopg2-binary = "^2.9.1"
+psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
 requests = "^2.31.0"
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.68.2"
+channel = "1.70.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -3,15 +3,19 @@
 //
 use anyhow::{bail, Context, Result};
 use clap::Parser;
 use futures::future::BoxFuture;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt, StreamExt};
 use remote_storage::RemoteStorageConfig;
 use tokio::runtime::Handle;
 use tokio::signal::unix::{signal, SignalKind};
 use tokio::task::JoinError;
 use toml_edit::Document;
 use utils::signals::ShutdownSignals;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::thread;
 use std::time::Duration;
 use storage_broker::Uri;
 use tokio::sync::mpsc;
@@ -20,22 +24,21 @@ use tracing::*;
 use utils::pid_file;
 use metrics::set_build_info_metric;
 use safekeeper::broker;
 use safekeeper::control_file;
 use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
    DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::http;
 use safekeeper::remove_wal;
 use safekeeper::wal_backup;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
 use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::JwtAuth;
 use utils::{
    http::endpoint,
    id::NodeId,
    logging::{self, LogFormat},
    project_git_version,
@@ -104,10 +107,6 @@ struct Args {
    /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
    #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
    max_offloader_lag: u64,
    /// Number of threads for wal backup runtime, by default number of cores
    /// available to the system.
    #[arg(long)]
    wal_backup_threads: Option<usize>,
    /// Number of max parallel WAL segments to be offloaded to remote storage.
    #[arg(long, default_value = "5")]
    wal_backup_parallel_jobs: usize,
@@ -121,9 +120,14 @@ struct Args {
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
    /// Run everything in single threaded current thread runtime, might be
    /// useful for debugging.
    #[arg(long)]
    current_thread_runtime: bool,
 }
-fn main() -> anyhow::Result<()> {
+#[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    if let Some(addr) = args.dump_control_file {
@@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> {
        heartbeat_timeout: args.heartbeat_timeout,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
        backup_runtime_threads: args.wal_backup_threads,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        auth,
        current_thread_runtime: args.current_thread_runtime,
    };
    // initialize sentry if SENTRY_DSN is provided
@@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> {
        Some(GIT_VERSION.into()),
        &[("node_id", &conf.my_id.to_string())],
    );
-    start_safekeeper(conf)
+    start_safekeeper(conf).await
 }
-fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+/// Result of joining any of main tasks: upper error means task failed to
 /// complete, e.g. panicked, inner is error produced by task itself.
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // Prevent running multiple safekeepers on the same directory
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file =
@@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // we need to release the lock file only when the current process is gone
    std::mem::forget(lock_file);
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
+    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
-        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
+    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
        e
    })?;
-    info!("starting safekeeper on {}", conf.listen_pg_addr);
+    info!(
-    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
+        "starting safekeeper HTTP service on {}",
-        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+        conf.listen_http_addr
    );
    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
        e
    })?;
@@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let timeline_collector = safekeeper::metrics::TimelineCollector::new();
    metrics::register_internal(Box::new(timeline_collector))?;
    let mut threads = vec![];
    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
    // Load all timelines from disk to memory.
    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
-    let conf_ = conf.clone();
+    // Keep handles to main tasks to die if any of them disappears.
-    threads.push(
+    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
-        thread::Builder::new()
+        FuturesUnordered::new();
            .name("http_endpoint_thread".into())
            .spawn(|| {
                let router = http::make_router(conf_);
                endpoint::serve_thread_main(
                    router,
                    http_listener,
                    std::future::pending(), // never shut down
                )
                .unwrap();
            })?,
    );
    let conf_cloned = conf.clone();
    let safekeeper_thread = thread::Builder::new()
        .name("WAL service thread".into())
        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
        .unwrap();
    threads.push(safekeeper_thread);
    let conf_ = conf.clone();
-    threads.push(
+    // Run everything in current thread rt, if asked.
-        thread::Builder::new()
+    if conf.current_thread_runtime {
-            .name("broker thread".into())
+        info!("running in current thread runtime");
-            .spawn(|| {
+    }
-                broker::thread_main(conf_);
+    let current_thread_rt = conf
-            })?,
+        .current_thread_runtime
-    );
+        .then(|| Handle::try_current().expect("no runtime in main"));
    let wal_service_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
        .spawn(wal_service::task_main(conf_, pg_listener))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
    tasks_handles.push(Box::pin(wal_service_handle));
    let conf_ = conf.clone();
-    threads.push(
+    let http_handle = current_thread_rt
-        thread::Builder::new()
+        .as_ref()
-            .name("WAL removal thread".into())
+        .unwrap_or_else(|| HTTP_RUNTIME.handle())
-            .spawn(|| {
+        .spawn(http::task_main(conf_, http_listener))
-                remove_wal::thread_main(conf_);
+        .map(|res| ("HTTP service main".to_owned(), res));
-            })?,
+    tasks_handles.push(Box::pin(http_handle));
    );
-    threads.push(
+    let conf_ = conf.clone();
-        thread::Builder::new()
+    let broker_task_handle = current_thread_rt
-            .name("WAL backup launcher thread".into())
+        .as_ref()
-            .spawn(move || {
+        .unwrap_or_else(|| BROKER_RUNTIME.handle())
-                wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
+        .spawn(broker::task_main(conf_).instrument(info_span!("broker")))
-            })?,
+        .map(|res| ("broker main".to_owned(), res));
-    );
+    tasks_handles.push(Box::pin(broker_task_handle));
    let conf_ = conf.clone();
    let wal_remover_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
        .spawn(remove_wal::task_main(conf_))
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(wal_remover_handle));
    let conf_ = conf.clone();
    let wal_backup_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
        .spawn(wal_backup::wal_backup_launcher_task_main(
            conf_,
            wal_backup_launcher_rx,
        ))
        .map(|res| ("WAL backup launcher".to_owned(), res));
    tasks_handles.push(Box::pin(wal_backup_handle));
    set_build_info_metric(GIT_VERSION);
    // TODO: put more thoughts into handling of failed threads
    // We should catch & die if they are in trouble.
-    // On any shutdown signal, log receival and exit. Additionally, handling
+    // TODO: update tokio-stream, convert to real async Stream with
-    // SIGQUIT prevents coredump.
+    // SignalStream, map it to obtain missing signal name, combine streams into
-    ShutdownSignals::handle(|signal| {
+    // single stream we can easily sit on.
-        info!("received {}, terminating", signal.name());
+    let mut sigquit_stream = signal(SignalKind::quit())?;
-        std::process::exit(0);
+    let mut sigint_stream = signal(SignalKind::interrupt())?;
-    })
+    let mut sigterm_stream = signal(SignalKind::terminate())?;
    tokio::select! {
        Some((task_name, res)) = tasks_handles.next()=> {
            error!("{} task failed: {:?}, exiting", task_name, res);
            std::process::exit(1);
        }
        // On any shutdown signal, log receival and exit. Additionally, handling
        // SIGQUIT prevents coredump.
        _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
        _ = sigint_stream.recv() => info!("received SIGINT, terminating"),
        _ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
    };
    std::process::exit(0);
 }
 /// Determine safekeeper id.
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -8,7 +8,7 @@ use anyhow::Error;
 use anyhow::Result;
 use storage_broker::parse_proto_ttid;
-use storage_broker::proto::broker_service_client::BrokerServiceClient;
+
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::Request;
@@ -16,7 +16,7 @@ use storage_broker::Request;
 use std::time::Duration;
 use std::time::Instant;
 use tokio::task::JoinHandle;
-use tokio::{runtime, time::sleep};
+use tokio::time::sleep;
 use tracing::*;
 use crate::metrics::BROKER_ITERATION_TIMELINES;
@@ -29,23 +29,10 @@ use crate::SafeKeeperConf;
 const RETRY_INTERVAL_MSEC: u64 = 1000;
 const PUSH_INTERVAL_MSEC: u64 = 1000;
 /// Thread entry point for the broker: builds a current-thread tokio runtime
 /// and blocks on `main_loop` inside a `broker` tracing span.
 ///
 /// Panics if the runtime cannot be built.
 pub fn thread_main(conf: SafeKeeperConf) {
    let rt = runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();

    // Keep the guard alive until the function returns so that everything
    // logged below is attributed to the span.
    let _span_guard = info_span!("broker").entered();
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    rt.block_on(main_loop(conf));
 }
 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
-    let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
+    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
    let outbound = async_stream::stream! {
@@ -55,20 +42,27 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
            // sensitive and there is no risk of deadlock as we don't await while
            // lock is held.
            let now = Instant::now();
-            let mut active_tlis = GlobalTimelines::get_all();
+            let all_tlis = GlobalTimelines::get_all();
-            active_tlis.retain(|tli| tli.is_active());
+            let mut n_pushed_tlis = 0;
-            for tli in &active_tlis {
+            for tli in &all_tlis {
-                let sk_info = tli.get_safekeeper_info(&conf);
+                // filtering alternative futures::stream::iter(all_tlis)
                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
                // doesn't look better, and I'm not sure how to do that without collect.
                if !tli.is_active().await {
                    continue;
                }
                let sk_info = tli.get_safekeeper_info(&conf).await;
                yield sk_info;
                BROKER_PUSHED_UPDATES.inc();
                n_pushed_tlis += 1;
            }
            let elapsed = now.elapsed();
            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
-            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+            BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);
            if elapsed > push_interval / 2 {
-                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
            }
            sleep(push_interval).await;
@@ -125,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    bail!("end of stream");
 }
-async fn main_loop(conf: SafeKeeperConf) {
+pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);
    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,9 +2,10 @@
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use tokio::fs::{self, File};
 use tokio::io::AsyncWriteExt;
-use std::fs::{self, File, OpenOptions};
+use std::io::Read;
 use std::io::{Read, Write};
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
 use std::time::Instant;
@@ -26,9 +27,10 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
 #[async_trait::async_trait]
 pub trait Storage: Deref<Target = SafeKeeperState> {
    /// Persist safekeeper state on disk and update internal state.
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
    /// Timestamp of last persist.
    fn last_persist_at(&self) -> Instant;
@@ -82,7 +84,7 @@ impl FileStorage {
    /// Check the magic/version in the on-disk data and deserialize it, if possible.
    fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
        // Read the version independent part
-        let magic = buf.read_u32::<LittleEndian>()?;
+        let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if magic != SK_MAGIC {
            bail!(
                "bad control file magic: {:X}, expected {:X}",
@@ -90,7 +92,7 @@ impl FileStorage {
                SK_MAGIC
            );
        }
-        let version = buf.read_u32::<LittleEndian>()?;
+        let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if version == SK_FORMAT_VERSION {
            let res = SafeKeeperState::des(buf)?;
            return Ok(res);
@@ -110,7 +112,7 @@ impl FileStorage {
    /// Read in the control file.
    pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
-        let mut control_file = OpenOptions::new()
+        let mut control_file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .open(&control_file_path)
@@ -159,30 +161,31 @@ impl Deref for FileStorage {
    }
 }
 #[async_trait::async_trait]
 impl Storage for FileStorage {
    /// persists state durably to underlying storage
    /// for description see https://lwn.net/Articles/457667/
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
        // write data to safekeeper.control.partial
        let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
-        let mut control_partial = File::create(&control_partial_path).with_context(|| {
+        let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
            format!(
                "failed to create partial control file at: {}",
                &control_partial_path.display()
            )
        })?;
        let mut buf: Vec<u8> = Vec::new();
-        buf.write_u32::<LittleEndian>(SK_MAGIC)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
-        buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
        s.ser_into(&mut buf)?;
        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
        buf.extend_from_slice(&checksum.to_le_bytes());
-        control_partial.write_all(&buf).with_context(|| {
+        control_partial.write_all(&buf).await.with_context(|| {
            format!(
                "failed to write safekeeper state into control file at: {}",
                control_partial_path.display()
@@ -191,7 +194,7 @@ impl Storage for FileStorage {
        // fsync the file
        if !self.conf.no_sync {
-            control_partial.sync_all().with_context(|| {
+            control_partial.sync_all().await.with_context(|| {
                format!(
                    "failed to sync partial control file at {}",
                    control_partial_path.display()
@@ -202,21 +205,22 @@ impl Storage for FileStorage {
        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
        // rename should be atomic
-        fs::rename(&control_partial_path, &control_path)?;
+        fs::rename(&control_partial_path, &control_path).await?;
        // this sync is not required by any standard but postgres does this (see durable_rename)
        if !self.conf.no_sync {
-            File::open(&control_path)
+            let new_f = File::open(&control_path).await?;
-                .and_then(|f| f.sync_all())
+            new_f.sync_all().await.with_context(|| {
-                .with_context(|| {
+                format!(
-                    format!(
+                    "failed to sync control file at: {}",
-                        "failed to sync control file at: {}",
+                    &control_path.display()
-                        &control_path.display()
+                )
-                    )
+            })?;
-                })?;
            // fsync the directory (linux specific)
-            File::open(&self.timeline_dir)
+            let tli_dir = File::open(&self.timeline_dir).await?;
-                .and_then(|f| f.sync_all())
+            tli_dir
                .sync_all()
                .await
                .context("failed to sync control file directory")?;
        }
@@ -236,7 +240,6 @@ mod test {
    use super::*;
    use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
    use anyhow::Result;
-    use std::fs;
    use utils::{id::TenantTimelineId, lsn::Lsn};
    fn stub_conf() -> SafeKeeperConf {
@@ -247,59 +250,75 @@ mod test {
        }
    }
-    fn load_from_control_file(
+    async fn load_from_control_file(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        Ok((
            FileStorage::restore_new(ttid, conf)?,
            FileStorage::load_control_file_conf(conf, ttid)?,
        ))
    }
-    fn create(
+    async fn create(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        let state = SafeKeeperState::empty();
        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
        Ok((storage, state))
    }
-    #[test]
+    #[tokio::test]
-    fn test_read_write_safekeeper_state() {
+    async fn test_read_write_safekeeper_state() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
+            let (mut storage, mut state) =
                create(&conf, &ttid).await.expect("failed to create state");
            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
                .persist(&state)
                .await
                .expect("failed to persist state");
        }
-        let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
+        let (_, state) = load_from_control_file(&conf, &ttid)
            .await
            .expect("failed to read state");
        assert_eq!(state.commit_lsn, Lsn(42));
    }
-    #[test]
+    #[tokio::test]
-    fn test_safekeeper_state_checksum_mismatch() {
+    async fn test_safekeeper_state_checksum_mismatch() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
+            let (mut storage, mut state) =
                create(&conf, &ttid).await.expect("failed to read state");
            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
                .persist(&state)
                .await
                .expect("failed to persist state");
        }
        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
-        let mut data = fs::read(&control_path).unwrap();
+        let mut data = fs::read(&control_path).await.unwrap();
        data[0] += 1; // change the first byte of the file to fail checksum validation
-        fs::write(&control_path, &data).expect("failed to write control file");
+        fs::write(&control_path, &data)
            .await
            .expect("failed to write control file");
-        match load_from_control_file(&conf, &ttid) {
+        match load_from_control_file(&conf, &ttid).await {
            Err(err) => assert!(err
                .to_string()
                .contains("safekeeper control file checksum mismatch")),
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -121,7 +121,7 @@ pub struct FileInfo {
 }
 /// Build debug dump response, using the provided [`Args`] filters.
-pub fn build(args: Args) -> Result<Response> {
+pub async fn build(args: Args) -> Result<Response> {
    let start_time = Utc::now();
    let timelines_count = GlobalTimelines::timelines_count();
@@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result<Response> {
        }
        let control_file = if args.dump_control_file {
-            let mut state = tli.get_state().1;
+            let mut state = tli.get_state().await.1;
            if !args.dump_term_history {
                state.acceptor_state.term_history = TermHistory(vec![]);
            }
@@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result<Response> {
        };
        let memory = if args.dump_memory {
-            Some(tli.memory_dump())
+            Some(tli.memory_dump().await)
        } else {
            None
        };
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {
        let lsn = if self.is_walproposer_recovery() {
            // walproposer should get all local WAL until flush_lsn
-            tli.get_flush_lsn()
+            tli.get_flush_lsn().await
        } else {
            // other clients shouldn't get any uncommitted WAL
-            tli.get_state().0.commit_lsn
+            tli.get_state().await.0.commit_lsn
        }
        .to_string();
-        let sysid = tli.get_state().1.server.system_id.to_string();
+        let sysid = tli.get_state().await.1.server.system_id.to_string();
        let lsn_bytes = lsn.as_bytes();
        let tli = PG_TLI.to_string();
        let tli_bytes = tli.as_bytes();
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -2,3 +2,18 @@ pub mod routes;
 pub use routes::make_router;
 pub use safekeeper_api::models;
 use crate::SafeKeeperConf;
 /// Runs the safekeeper HTTP API on an already-bound listener.
 ///
 /// Builds the router from `conf`, wraps it into a service and serves it on
 /// `http_listener` until the server future completes.
 ///
 /// # Errors
 ///
 /// Returns an error if the router or the router service cannot be built, if
 /// the listener cannot be adopted by hyper, or if serving fails.
 pub async fn task_main(
    conf: SafeKeeperConf,
    http_listener: std::net::TcpListener,
 ) -> anyhow::Result<()> {
    let router = make_router(conf)
        .build()
        .map_err(|err| anyhow::anyhow!(err))?;
    // Report a service construction failure to the caller instead of
    // panicking; this function already returns `anyhow::Result`.
    let service = utils::http::RouterService::new(router)
        .map_err(|err| anyhow::anyhow!(err))?;
    let server = hyper::Server::from_tcp(http_listener)?;
    server.serve(service).await?;
    Ok(()) // unreachable
 }
--- a/Show More
+++ b/Show More