Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

build: upgrade ahash (#5851 )
`cargo deny` was complaining the version 0.8.3 was yanked (for possible DoS attack [wiki]), but the latest version (0.8.5) also includes aarch64 fixes which may or may not be relevant. Our usage of ahash limits to proxy, but I don't think we are at any risk. [wiki]: https://github.com/tkaitchuck/aHash/wiki/Yanked-versions
2026-05-31 20:10:38 +00:00 · 2023-11-11 02:24:51 +01:00 · 2023-11-10 19:10:54 +00:00 · 2023-11-10 18:02:22 +01:00 · 2023-11-10 16:05:21 +02:00 · 2023-11-10 13:58:18 +00:00
237 changed files with 16132 additions and 10091 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,8 +1,12 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
    - us-east-2
 config-variables:
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
+        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
+          exit 0
+        fi
+
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -320,6 +321,9 @@ jobs:
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -335,6 +339,16 @@ jobs:
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -420,7 +434,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -455,7 +469,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -710,6 +724,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -834,7 +849,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.1
+      VM_BUILDER_VERSION: v0.18.5

    steps:
      - name: Checkout
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -32,7 +35,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -90,18 +93,21 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(nproc)
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(nproc)
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(nproc)
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(nproc)
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)

      - name: Run cargo build
        run: cargo build --all --release
@@ -109,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -126,7 +306,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -135,6 +315,9 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(nproc)
+
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 2'
+    - cron: '0 7 * * 5'
  workflow_dispatch:

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
+    "libs/walproposer",
 ]

 [workspace.package]
@@ -35,6 +36,7 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -46,6 +48,7 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
+aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -64,7 +67,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -80,7 +83,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -123,6 +126,7 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
+serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -160,11 +164,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -185,6 +189,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -200,7 +205,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }

 ################# Binary contents sections

--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/42
+++ b/42
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -72,6 +72,10 @@ neon: postgres-headers
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
@@ -168,6 +172,42 @@ neon-pg-ext-clean-%:
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

+# Build walproposer as a static library. walproposer source code is located
+# in the pgxn/neon directory.
+# 
+# We also need to include libpgport.a and libpgcommon.a, because walproposer
+# uses some functions from those libraries.
+# 
+# Some object files are removed from libpgport.a and libpgcommon.a because
+# they depend on openssl and other libraries that are not included in our
+# Rust build.
+.PHONY: walproposer-lib
+walproposer-lib: neon-pg-ext-v16
+	+@echo "Compiling walproposer-lib"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+		pg_strong_random.o
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o \
+		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
+		md5_common.o \
+		checksum_helper.o
+endif
+
+.PHONY: walproposer-lib-clean
+walproposer-lib-clean:
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -156,6 +156,7 @@ fn main() -> Result<()> {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
+                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
@@ -277,32 +278,26 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let vm_monitor_addr = matches
+                .get_one::<String>("vm-monitor-addr")
+                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
-            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
+            let rt = if env::var_os("AUTOSCALING").is_some() {
+                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
+                        .expect("failed to create tokio runtime for monitor")
+                )
+            } else {
+                None
            };

            // This token is used internally by the monitor to clean up all threads
@@ -313,8 +308,7 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
-                        file_cache_on_disk,
+                        addr: vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
@@ -486,6 +480,8 @@ fn cli() -> clap::Command {
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
+            // DEPRECATED, NO LONGER DOES ANYTHING.
+            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,8 +710,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -724,9 +728,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -78,7 +78,7 @@ use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::path::Path;
 use std::str;
 use tar::Archive;
@@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
        max_keys_per_list_response: None,
    };
    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
        storage: RemoteStorageKind::AwsS3(config),
    };
    GenericRemoteStorage::from_config(&config)
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
+        .query(
+            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
+            &[],
+        )?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
+            replication: Some(row.get("rolreplication")),
+            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", jwt)
+        .header("Authorization", format!("Bearer {}", jwt))
        .send()
        .map_err(|e| {
            (
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
 ) -> Result<Option<ComputeSpec>> {
-    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
+    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
@@ -265,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
+                || !r.bypassrls.unwrap_or(false)
+                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -296,13 +298,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
+                let mut query: String =
+                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,7 +2,6 @@ use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
 use utils::id::{NodeId, TenantId};

@@ -14,12 +13,10 @@ pub struct AttachmentService {

 const COMMAND: &str = "attachment_service";

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    pub pageserver_id: Option<NodeId>,
+    pub node_id: Option<NodeId>,
 }

 #[derive(Serialize, Deserialize)]
@@ -85,7 +82,7 @@ impl AttachmentService {
            .control_plane_api
            .clone()
            .unwrap()
-            .join("attach_hook")
+            .join("attach-hook")
            .unwrap();
        let client = reqwest::blocking::ClientBuilder::new()
            .build()
@@ -93,7 +90,7 @@ impl AttachmentService {

        let request = AttachHookRequest {
            tenant_id,
-            pageserver_id: Some(pageserver_id),
+            node_id: Some(pageserver_id),
        };

        let response = client.post(url).json(&request).send()?;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -12,7 +12,9 @@ use hyper::{Body, Request, Response};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
+use utils::http::endpoint::request_span;
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -170,7 +172,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                id: *t,
-                generation: state.generation,
+                gen: state.generation,
            });
        }
    }
@@ -216,14 +218,31 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
-            pageserver: attach_req.pageserver_id,
+            pageserver: attach_req.node_id,
            generation: 0,
        });

-    if attach_req.pageserver_id.is_some() {
+    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            ps_id = %attaching_pageserver,
+            generation = %tenant_state.generation,
+            "issuing",
+        );
+    } else if let Some(ps_id) = tenant_state.pageserver {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            %ps_id,
+            generation = %tenant_state.generation,
+            "dropping",
+        );
+    } else {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            "no-op: tenant already has no pageserver");
    }
-    tenant_state.pageserver = attach_req.pageserver_id;
+    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -231,7 +250,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    json_response(
        StatusCode::OK,
        AttachHookResponse {
-            gen: attach_req.pageserver_id.map(|_| generation),
+            gen: attach_req.node_id.map(|_| generation),
        },
    )
 }
@@ -239,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", handle_re_attach)
-        .post("/validate", handle_validate)
-        .post("/attach_hook", handle_attach_hook)
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
+        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
 }

 #[tokio::main]
@@ -268,7 +287,16 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
+
+    tokio::task::spawn(server);
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;

    Ok(())
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -798,6 +798,24 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
+        "reconfigure" => {
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
+                } else {
+                    None
+                };
+            endpoint.reconfigure(pageserver_id)?;
+        }
        "stop" => {
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
@@ -1369,6 +1387,12 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
+                .subcommand(Command::new("reconfigure")
+                            .about("Reconfigure the endpoint")
+                            .arg(endpoint_pageserver_id_arg)
+                            .arg(endpoint_id_arg.clone())
+                            .arg(tenant_id_arg.clone())
+                )
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,7 +46,6 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -57,13 +56,10 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
-    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
@@ -414,18 +410,34 @@ impl Endpoint {
            );
        }

-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
+        Ok(())
+    }
+
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-
        Ok(())
    }

+    fn read_postgresql_conf(&self) -> Result<String> {
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => Ok(String::from_utf8(content)?),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
+            Err(e) => Err(anyhow::Error::new(e).context(format!(
+                "failed to read config file in {}",
+                postgresql_conf_path.to_str().unwrap()
+            ))),
+        }
+    }
+
    pub fn start(
        &self,
        auth_token: &Option<String>,
@@ -436,21 +448,7 @@ impl Endpoint {
            anyhow::bail!("The endpoint is already running");
        }

-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => String::from_utf8(content)?,
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-            Err(e) => {
-                return Err(anyhow::Error::new(e).context(format!(
-                    "failed to read config file in {}",
-                    postgresql_conf_path.to_str().unwrap()
-                )))
-            }
-        };
+        let postgresql_conf = self.read_postgresql_conf()?;

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
@@ -621,6 +619,61 @@ impl Endpoint {
        }
    }

+    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+        let mut spec: ComputeSpec = {
+            let spec_path = self.endpoint_path().join("spec.json");
+            let file = std::fs::File::open(spec_path)?;
+            serde_json::from_reader(file)?
+        };
+
+        let postgresql_conf = self.read_postgresql_conf()?;
+        spec.cluster.postgresql_conf = Some(postgresql_conf);
+
+        if let Some(pageserver_id) = pageserver_id {
+            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
+            let mut endpoint_conf: EndpointConf = {
+                let file = std::fs::File::open(&endpoint_config_path)?;
+                serde_json::from_reader(file)?
+            };
+            endpoint_conf.pageserver_id = pageserver_id;
+            std::fs::write(
+                endpoint_config_path,
+                serde_json::to_string_pretty(&endpoint_conf)?,
+            )?;
+
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        }
+
+        let client = reqwest::blocking::Client::new();
+        let response = client
+            .post(format!(
+                "http://{}:{}/configure",
+                self.http_address.ip(),
+                self.http_address.port()
+            ))
+            .body(format!(
+                "{{\"spec\":{}}}",
+                serde_json::to_string_pretty(&spec)?
+            ))
+            .send()?;
+
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(())
+        } else {
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
+    }
+
    pub fn stop(&self, destroy: bool) -> Result<()> {
        // If we are going to destroy data directory,
        // use immediate shutdown mode, otherwise,
@@ -629,15 +682,25 @@ impl Endpoint {
        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;
+
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
+        //
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
-            self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
-        } else {
-            self.pg_ctl(&["stop"], &None)?;
        }
        Ok(())
    }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as a local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -59,7 +57,6 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -84,7 +81,6 @@ pub struct LocalEnv {
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
-    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -0,0 +1,108 @@
+# Updating Postgres
+
+## Minor Versions
+
+When upgrading to a new minor version of Postgres, please follow these steps:
+
+_Example: 15.4 is the new minor version to upgrade to from 15.3._
+
+1. Clone the Neon Postgres repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/postgres.git
+    ```
+
+1. Add the Postgres upstream remote.
+
+    ```shell
+    git remote add upstream https://git.postgresql.org/git/postgresql.git
+    ```
+
+1. Create a new branch based on the stable branch you are updating.
+
+    ```shell
+    git checkout -b my-branch REL_15_STABLE_neon
+    ```
+
+1. Tag the last commit on the stable branch you are updating.
+
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.
+
+    ```shell
+    git fetch upstream REL_15_4
+    git rebase REL_15_4
+    ```
+
+1. Run the Postgres test suite to make sure our commits have not affected
+Postgres in a negative way.
+
+    ```shell
+    make check
+    # OR
+    meson test -C builddir
+    ```
+
+1. Push your branch to the Neon Postgres repository.
+
+    ```shell
+    git push origin my-branch
+    ```
+
+1. Clone the Neon repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/neon.git
+    ```
+
+1. Create a new branch.
+
+1. Change the `revisions.json` file to point at the HEAD of your Postgres
+branch.
+
+1. Update the Git submodule.
+
+    ```shell
+    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule update --remote vendor/postgres-v15
+    ```
+
+1. Run the Neon test suite to make sure that Neon is still good to go on this
+minor Postgres release.
+
+    ```shell
+    ./scripts/poetry -k pg15
+    ```
+
+1. Commit your changes.
+
+1. Create a pull request, and wait for CI to go green.
+
+1. Force push the rebased Postgres branches into the Neon Postgres repository.
+
+    ```shell
+    git push --force origin my-branch:REL_15_STABLE_neon
+    ```
+
+    It may require disabling various branch protections.
+
+1. Update your Neon PR to point at the branches.
+
+    ```shell
+    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
+    git commit --amend --no-edit
+    git push --force origin
+    ```
+
+1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,7 +6,6 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -19,7 +18,6 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
-#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub pageserver_connstring: Option<String>,
+
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -140,14 +138,13 @@ impl RemoteExtSpec {
    }
 }

-#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    Static(Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
@@ -190,6 +187,8 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
+    pub replication: Option<bool>,
+    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
@@ -89,14 +90,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub fn set_build_info_metric(revision: &str) {
+pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build/version information",
-        &["revision"]
+        &["revision", "build_tag"]
    )
    .expect("Failed to register build info metric");
-    metric.with_label_values(&[revision]).set(1);
+    metric.with_label_values(&[revision, build_tag]).set(1);
 }

 // Records I/O stats in a "cross-platform" way.
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,7 +4,6 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -12,12 +11,10 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
-    pub generation: u32,
+    pub gen: u32,
 }

 #[derive(Serialize, Deserialize)]
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
+use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
@@ -174,25 +174,19 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -201,7 +195,6 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -278,31 +271,26 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
+pub struct TenantCreateResponse(pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -374,10 +362,8 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -388,33 +374,22 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -426,7 +401,6 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -523,23 +497,13 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-    },
-    Frozen {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_end: Lsn,
-    },
+    Open { lsn_start: Lsn },
+    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -547,9 +511,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -558,7 +520,6 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custo Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
    }
 }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -242,6 +250,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        }
    }

+    /// Cancellation safe as long as the underlying IO is cancellation safe.
    async fn shutdown(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -393,13 +402,23 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
-        F: Fn() -> S,
+        F: Fn() -> S + Clone,
        S: Future,
    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        // socket might be already closed, e.g. if previously received error,
-        // so ignore result.
-        self.framed.shutdown().await.ok();
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;
+
+        tokio::select! {
+            _ = shutdown_watcher() => {
+                // do nothing; we most likely got already stopped by shutdown and will log it next.
+            }
+            _ = self.framed.shutdown() => {
+                // socket might be already closed, e.g. if previously received error,
+                // so ignore result.
+            },
+        }
+
        match ret {
            Ok(()) => Ok(()),
            Err(QueryError::Shutdown) => {
@@ -597,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -717,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
+                    match e {
+                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
+                        e => {
+                            log_query_error(query_string, &e);
+                            let short_error = short_error(&e);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &short_error,
+                                Some(e.pg_error_code()),
+                            ))?;
+                        }
+                    }
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
@@ -948,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -964,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -14,6 +14,7 @@ macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
+            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -214,27 +214,24 @@ where
    }
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn flush<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
 ) -> Result<(), io::Error> {
    while write_buf.has_remaining() {
-        let bytes_written = stream.write(write_buf.chunk()).await?;
+        let bytes_written = stream.write_buf(write_buf).await?;
        if bytes_written == 0 {
            return Err(io::Error::new(
                ErrorKind::WriteZero,
                "failed to write message",
            ));
        }
-        // The advanced part will be garbage collected, likely during shifting
-        // data left on next attempt to write to buffer when free space is not
-        // enough.
-        write_buf.advance(bytes_written);
    }
-    write_buf.clear();
    stream.flush().await
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn shutdown<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
+aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,21 +1,18 @@
 //! Azure Blob Storage wrapper

+use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::sync::Arc;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};
+use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use futures_util::StreamExt;
 use http_types::StatusCode;
 use tokio::io::AsyncRead;
@@ -112,31 +109,19 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
-        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

+        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
-            let part = match part {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
+            let part = part.map_err(to_download_error)?;
+            if let Some(blob_meta) = part.blob.metadata {
+                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+            }
            let data = part
                .data
                .collect()
@@ -146,42 +131,9 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
+            metadata: Some(StorageMetadata(metadata)),
        })
    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        match builder.into_future().await {
-            Ok(r) => {
-                let mut map = HashMap::new();
-
-                for md in r.metadata.iter() {
-                    map.insert(
-                        md.name().as_str().to_string(),
-                        md.value().as_str().to_string(),
-                    );
-                }
-                Ok(StorageMetadata(map))
-            }
-            Err(e) => {
-                return Err(if let Some(http_err) = e.as_http_error() {
-                    match http_err.status() {
-                        StatusCode::NotFound => DownloadError::NotFound,
-                        StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
-                        _ => DownloadError::Other(anyhow::Error::new(e)),
-                    }
-                } else {
-                    DownloadError::Other(e.into())
-                });
-            }
-        }
-    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -199,6 +151,18 @@ fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
    res
 }

+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
    async fn list(
@@ -238,23 +202,7 @@ impl RemoteStorage for AzureBlobStorage {
        let mut response = builder.into_stream();
        let mut res = Listing::default();
        while let Some(l) = response.next().await {
-            let entry = match l {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
-
+            let entry = l.map_err(to_download_error)?;
            let prefix_iter = entry
                .blobs
                .prefixes()
@@ -302,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let builder = blob_client.get();

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn download_byte_range(
@@ -318,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let mut builder = blob_client.get();

        if let Some(end_exclusive) = end_exclusive {
@@ -334,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        }

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,19 +6,15 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +30,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -184,12 +174,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        &self,
        prefix: Option<&RemotePath>,
        _mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError>; /* {
-                                                     // XXX Placeholder impl.
-                                                     let mut result = Listing::default();
-                                                     result.prefixes = self.list_prefixes(prefix).await?;
-                                                     Ok(result)
-                                                 }*/
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -446,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -545,18 +526,6 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -638,11 +607,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -156,10 +156,19 @@ impl RemoteStorage for LocalFs {
        let mut result = Listing::default();

        if let ListingMode::NoDelimiter = mode {
-            result.keys = self
+            let keys = self
                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
+
+            result.keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();
+
            return Ok(result);
        }

@@ -193,7 +202,7 @@ impl RemoteStorage for LocalFs {
            if prefix.is_dir() {
                result.prefixes.push(stripped);
            } else {
-                result.keys.push(stripped)
+                result.keys.push(stripped);
            }
        }

@@ -738,6 +747,43 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn list() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let storage = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        assert!(listing.prefixes.is_empty());
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+
+        // Delimiter: should only go one deep
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines").unwrap()].to_vec()
+        );
+        assert!(listing.keys.is_empty());
+
+        // Delimiter & prefix
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
+                ListingMode::WithDelimiter,
+            )
+            .await?;
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
+        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        Ok(())
+    }
+
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,23 +4,27 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
+use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -83,10 +87,23 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .credentials_provider(credentials_provider)
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
+            .retry_config(retry_config.build());

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -267,6 +267,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    let buf = download_and_compare(dl).await?;
    assert_eq!(buf, data);

+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
    Ok(())
 }

@@ -463,8 +469,6 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -396,8 +396,6 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,23 +1,18 @@
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
-    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
-#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
@@ -55,6 +56,7 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
+serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -9,9 +10,8 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

-use crate::id::TenantId;
+use crate::{http::error::ApiError, id::TenantId};

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -32,11 +32,9 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
@@ -47,31 +45,106 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth));
+    }
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        self.0.load().decode(token)
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load())
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct AuthError(pub Cow<'static, str>);
+
+impl Display for AuthError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debugability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+    /// Attempt to decode the token with the internal decoding keys.
+    ///
+    /// The function tries the stored decoding keys in succession,
+    /// and returns the first yielding a successful result.
+    /// If there is no working decoding key, it returns the last error.
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        let mut res = None;
+        for decoding_key in &self.decoding_keys {
+            res = Some(decode(token, decoding_key, &self.validation));
+            if let Some(Ok(res)) = res {
+                return Ok(res);
+            }
+        }
+        if let Some(res) = res {
+            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
+        } else {
+            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
+        }
    }
 }

@@ -111,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() -> Result<(), anyhow::Error> {
+    fn test_decode() {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

@@ -132,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
-
-        Ok(())
    }

    #[test]
-    fn test_encode() -> Result<(), anyhow::Error> {
+    fn test_encode() {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let decoded = auth.decode(&encoded)?;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let decoded = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
-
-        Ok(())
    }
 }
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,36 +1,12 @@
-use std::sync::{atomic::AtomicI32, Arc};
+use std::sync::Arc;

 use tokio::sync::{mpsc, Mutex};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
-pub struct Completion {
-    sender: mpsc::Sender<()>,
-    refcount: Arc<AtomicI32>,
-}
-
-impl Clone for Completion {
-    fn clone(&self) -> Self {
-        let i = self
-            .refcount
-            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-        tracing::info!("Completion::clone[{:p}]: {i}", &(*self.refcount));
-        Self {
-            sender: self.sender.clone(),
-            refcount: self.refcount.clone(),
-        }
-    }
-}
-
-impl Drop for Completion {
-    fn drop(&mut self) {
-        let i = self
-            .refcount
-            .fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
-        tracing::info!("Completion::drop[{:p}]: {i}", &(*self.refcount));
-    }
-}
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
@@ -69,11 +45,5 @@ pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
    let rx = Mutex::new(rx);
    let rx = Arc::new(rx);
-    (
-        Completion {
-            sender: tx,
-            refcount: Arc::new(AtomicI32::new(1)),
-        },
-        Barrier(rx),
-    )
+    (Completion(tx), Barrier(rx))
 }
--- a/libs/utils/src/exp_counter.rs
+++ b/libs/utils/src/exp_counter.rs
@@ -0,0 +1,156 @@
+/// An exhausting, injective counter with exponential search properties
+///
+/// The type implements an [`Iterator`] that yields numbers from the range
+/// from 0 to a pre-defined maximum.
+///
+/// * It is *exhausting* in that it visits all numbers inside `0..max`.
+/// * It is *injective* in that all numbers are visited only once.
+/// * It has *exponential search properties* in that it iterates over the
+///   number range in a fractal pattern.
+///
+/// This iterator is well suited for finding a pivot in algorithms that
+/// want centered pivots: its output is heavily biased towards starting
+/// with small numbers.
+///
+/// If the specified maximum is close to an exponent
+/// of two we will quickly reach that exponent and therefore we will
+/// quickly close in on the maximum, contrary to the claim of the counter
+/// providing centered pivots. For most algorithms, this is not a problem
+/// though as long as this is only a limited number of times, which it does,
+/// as the next maximum will be an exponent of two, for which the next
+/// smaller one is precisely its half of. So we will be able to perform a
+/// binary search in all instances.
+pub struct ExpCounter {
+    /// The (exclusive) upper limit of our search
+    max: u64,
+    /// The base that increases after each round trip
+    base: u64,
+    /// An increasing offset, always a power of two
+    offs: u64,
+    /// Iteration counter
+    i: u64,
+}
+
+impl ExpCounter {
+    /// Creates an iterator that uses `ExpCounter` for the first half of the
+    /// range and a linear range counter for the second half.
+    pub fn with_max_and_linear_search(max: u64) -> impl Iterator<Item = u64> {
+        let linear_start = max / 2;
+        ExpCounter::with_max(linear_start).chain(linear_start..max)
+    }
+    /// Creates a new `ExpCounter` instance that counts to the (exclusive) maximum
+    pub fn with_max(max: u64) -> Self {
+        Self {
+            max,
+            base: 0,
+            offs: 1,
+            i: 0,
+        }
+    }
+}
+
+impl Iterator for ExpCounter {
+    type Item = u64;
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.i >= self.max {
+            return None;
+        }
+        if self.i == 0 {
+            // Special casing this is easier than adding 0 as the first in some other fashion
+            self.i += 1;
+            return Some(0);
+        }
+        let to_yield = self.base + self.offs;
+        self.offs *= 2;
+
+        if self.base + self.offs >= self.max {
+            self.base += 1;
+            self.offs = 1;
+            while self.base > self.offs {
+                self.offs *= 2;
+            }
+            if (self.base + self.offs).count_ones() == 1 {
+                self.offs *= 2;
+            }
+        }
+        self.i += 1;
+
+        Some(to_yield)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use super::*;
+
+    fn dupes_and_missing(list: &mut [u64], max: u64) -> (usize, usize) {
+        let mut contained_in_list = HashSet::<u64>::new();
+        let mut dupes = 0;
+        let mut missing = 0;
+
+        for v in list.iter() {
+            println!("{v:4 } = {v:010b}");
+        }
+        println!("Yielded {} items", list.len());
+        println!("The duplicates:");
+        list.sort();
+        let mut prev = None;
+        for v in list.iter() {
+            contained_in_list.insert(*v);
+            if Some(*v) == prev {
+                dupes += 1;
+                println!("{v:3 } = {v:08b}");
+            }
+            prev = Some(*v);
+        }
+        println!("The missing numbers:");
+        for v in 0..max {
+            if !contained_in_list.contains(&v) {
+                println!("{v:3 } = {v:010b}");
+                missing += 1;
+            }
+        }
+        (dupes, missing)
+    }
+
+    #[test]
+    fn to_40() {
+        let max = 40;
+        let list = ExpCounter::with_max(max).collect::<Vec<_>>();
+        let expected = [
+            0, 1, 2, 4, 8, 16, 32, 3, 5, 9, 17, 33, 6, 10, 18, 34, 7, 11, 19, 35, 12, 20, 36, 13,
+            21, 37, 14, 22, 38, 15, 23, 39, 24, 25, 26, 27, 28, 29, 30, 31,
+        ];
+        assert_eq!(list, expected);
+    }
+
+    #[test]
+    fn to_64() {
+        let max = 64;
+        let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
+        assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
+    }
+
+    #[test]
+    fn to_100() {
+        let max = 100;
+        let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
+        assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
+    }
+
+    #[test]
+    fn to_127() {
+        let max = 127;
+        let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
+        assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
+    }
+
+    #[test]
+    fn to_12345() {
+        let max = 12345;
+        let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
+        assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
+    }
+}
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// See docs/rfcs/025-generation-numbers.md for detail on how generation
 /// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
 pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json.  This value is only to be used
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -0,0 +1,41 @@
+/// Useful type for asserting that expected bytes match reporting the bytes more readable
+/// array-syntax compatible hex bytes.
+///
+/// # Usage
+///
+/// ```
+/// use utils::Hex;
+///
+/// let actual = serialize_something();
+/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
+///
+/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
+/// // output suffixed with an array style length for easier comparisons.
+/// assert_eq!(Hex(&actual), Hex(&expected));
+///
+/// // with `let expected = [0x68];` the error would had been:
+/// // assertion `left == right` failed
+/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
+/// // right: [0x68; 1]
+/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
+/// ```
+#[derive(PartialEq)]
+pub struct Hex<'a>(pub &'a [u8]);
+
+impl std::fmt::Debug for Hex<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        for (i, c) in self.0.chunks(16).enumerate() {
+            if i > 0 && !c.is_empty() {
+                writeln!(f, ", ")?;
+            }
+            for (j, b) in c.iter().enumerate() {
+                if j > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "0x{b:02x}")?;
+            }
+        }
+        write!(f, "; {}]", self.0.len())
+    }
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -14,6 +14,11 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::str::FromStr;

+use bytes::{Bytes, BytesMut};
+use std::io::Write as _;
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -146,94 +151,89 @@ impl Drop for RequestCancelled {
    }
 }

+/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+pub struct ChannelWriter {
+    buffer: BytesMut,
+    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
+    written: usize,
+}
+
+impl ChannelWriter {
+    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+        assert_ne!(buf_len, 0);
+        ChannelWriter {
+            // split about half off the buffer from the start, because we flush depending on
+            // capacity. first flush will come sooner than without this, but now resizes will
+            // have better chance of picking up the "other" half. not guaranteed of course.
+            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+            tx,
+            written: 0,
+        }
+    }
+
+    pub fn flush0(&mut self) -> std::io::Result<usize> {
+        let n = self.buffer.len();
+        if n == 0 {
+            return Ok(0);
+        }
+
+        tracing::trace!(n, "flushing");
+        let ready = self.buffer.split().freeze();
+
+        // not ideal to call from blocking code to block_on, but we are sure that this
+        // operation does not spawn_blocking other tasks
+        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+            self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+            // throttle sending to allow reuse of our buffer in `write`.
+            self.tx.reserve().await.map_err(|_| ())?;
+
+            // now the response task has picked up the buffer and hopefully started
+            // sending it to the client.
+            Ok(())
+        });
+        if res.is_err() {
+            return Err(std::io::ErrorKind::BrokenPipe.into());
+        }
+        self.written += n;
+        Ok(n)
+    }
+
+    pub fn flushed_bytes(&self) -> usize {
+        self.written
+    }
+}
+
+impl std::io::Write for ChannelWriter {
+    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+        let remaining = self.buffer.capacity() - self.buffer.len();
+
+        let out_of_space = remaining < buf.len();
+
+        let original_len = buf.len();
+
+        if out_of_space {
+            let can_still_fit = buf.len() - remaining;
+            self.buffer.extend_from_slice(&buf[..can_still_fit]);
+            buf = &buf[can_still_fit..];
+            self.flush0()?;
+        }
+
+        // assume that this will often under normal operation just move the pointer back to the
+        // beginning of allocation, because previous split off parts are already sent and
+        // dropped.
+        self.buffer.extend_from_slice(buf);
+        Ok(original_len)
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.flush0().map(|_| ())
+    }
+}
+
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    use bytes::{Bytes, BytesMut};
-    use std::io::Write as _;
-    use tokio::sync::mpsc;
-    use tokio_stream::wrappers::ReceiverStream;
-
    SERVE_METRICS_COUNT.inc();

-    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-    struct ChannelWriter {
-        buffer: BytesMut,
-        tx: mpsc::Sender<std::io::Result<Bytes>>,
-        written: usize,
-    }
-
-    impl ChannelWriter {
-        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-            assert_ne!(buf_len, 0);
-            ChannelWriter {
-                // split about half off the buffer from the start, because we flush depending on
-                // capacity. first flush will come sooner than without this, but now resizes will
-                // have better chance of picking up the "other" half. not guaranteed of course.
-                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-                tx,
-                written: 0,
-            }
-        }
-
-        fn flush0(&mut self) -> std::io::Result<usize> {
-            let n = self.buffer.len();
-            if n == 0 {
-                return Ok(0);
-            }
-
-            tracing::trace!(n, "flushing");
-            let ready = self.buffer.split().freeze();
-
-            // not ideal to call from blocking code to block_on, but we are sure that this
-            // operation does not spawn_blocking other tasks
-            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-                // throttle sending to allow reuse of our buffer in `write`.
-                self.tx.reserve().await.map_err(|_| ())?;
-
-                // now the response task has picked up the buffer and hopefully started
-                // sending it to the client.
-                Ok(())
-            });
-            if res.is_err() {
-                return Err(std::io::ErrorKind::BrokenPipe.into());
-            }
-            self.written += n;
-            Ok(n)
-        }
-
-        fn flushed_bytes(&self) -> usize {
-            self.written
-        }
-    }
-
-    impl std::io::Write for ChannelWriter {
-        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-            let remaining = self.buffer.capacity() - self.buffer.len();
-
-            let out_of_space = remaining < buf.len();
-
-            let original_len = buf.len();
-
-            if out_of_space {
-                let can_still_fit = buf.len() - remaining;
-                self.buffer.extend_from_slice(&buf[..can_still_fit]);
-                buf = &buf[can_still_fit..];
-                self.flush0()?;
-            }
-
-            // assume that this will often under normal operation just move the pointer back to the
-            // beginning of allocation, because previous split off parts are already sent and
-            // dropped.
-            self.buffer.extend_from_slice(buf);
-            Ok(original_len)
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            self.flush0().map(|_| ())
-        }
-    }
-
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth
-                        .decode(token)
-                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
+                    let data = auth.decode(token).map_err(|err| {
+                        warn!("Authentication error: {err}");
+                        // Rely on From<AuthError> for ApiError impl
+                        err
+                    })?;
                    req.set_context(data.claims);
                }
                None => {
@@ -450,12 +452,11 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
+    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => {
-            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
-        }
+        Some(claims) => Ok(check_permission(&claims)
+            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::{error, info, warn};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
+        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
+            warn!("Error processing HTTP request: {api_error:#}")
+        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
+use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -17,12 +18,74 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-///
-/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
-/// Check the `serde_with::serde_as` documentation for options for more complex types.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
 struct Id([u8; 16]);

+impl Serialize for Id {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Id {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for IdVisitor {
+            type Value = Id;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 16])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 16] = Deserialize::deserialize(s)?;
+                Ok(Id::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Id::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                16,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -57,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
@@ -308,3 +373,112 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
+    use crate::bin_ser::BeSer;
+
+    use super::*;
+
+    #[test]
+    fn test_id_serde_non_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![
+            Token::Tuple { len: 16 },
+            Token::U8(173),
+            Token::U8(80),
+            Token::U8(132),
+            Token::U8(115),
+            Token::U8(129),
+            Token::U8(226),
+            Token::U8(72),
+            Token::U8(254),
+            Token::U8(170),
+            Token::U8(201),
+            Token::U8(135),
+            Token::U8(108),
+            Token::U8(199),
+            Token::U8(26),
+            Token::U8(228),
+            Token::U8(24),
+            Token::TupleEnd,
+        ]);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(serialized_tokens)
+            .build();
+        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
+        assert_eq!(deserialized_id, original_id);
+    }
+
+    #[test]
+    fn test_id_serde_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![Token::Str(String::from(
+            "ad50847381e248feaac9876cc71ae418",
+        ))]);
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(Tokens(vec![Token::Str(String::from(
+                "ad50847381e248feaac9876cc71ae418",
+            ))]))
+            .build();
+        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
+    }
+
+    macro_rules! roundtrip_type {
+        ($type:ty, $expected_bytes:expr) => {{
+            let expected_bytes: [u8; 16] = $expected_bytes;
+            let original_id = <$type>::from(expected_bytes);
+
+            let ser_bytes = original_id.ser().unwrap();
+            assert_eq!(ser_bytes, expected_bytes);
+
+            let des_id = <$type>::des(&ser_bytes).unwrap();
+            assert_eq!(des_id, original_id);
+        }};
+    }
+
+    #[test]
+    fn test_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(Id, expected_bytes);
+    }
+
+    #[test]
+    fn test_tenant_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TenantId, expected_bytes);
+    }
+
+    #[test]
+    fn test_timeline_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TimelineId, expected_bytes);
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

@@ -24,6 +25,10 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
+
+mod hex;
+pub use hex::Hex;
+
 // http endpoint utils
 pub mod http;

@@ -73,6 +78,12 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod exp_counter;
+/// async timeout helper
+pub mod timeout;
+
+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -128,6 +139,21 @@ macro_rules! project_git_version {
    };
 }

+/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
+#[macro_export]
+macro_rules! project_build_tag {
+    ($const_identifier:ident) => {
+        const $const_identifier: &::core::primitive::str = {
+            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
+                ::core::option::Option::Some(x) => ["build_tag-env:", x],
+                ::core::option::Option::None => ["build_tag:", ""],
+            };
+
+            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
+        };
+    };
+}
+
 /// Re-export for `project_git_version` macro
 #[doc(hidden)]
 pub use const_format as __const_format;
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{Deserialize, Serialize};
+use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
-#[serde(transparent)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
 pub struct Lsn(pub u64);

+impl Serialize for Lsn {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Lsn {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct LsnVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for LsnVisitor {
+            type Value = Lsn;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str(
+                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
+                    )
+                } else {
+                    formatter.write_str("value in form of integer(u64)")
+                }
+            }
+
+            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(Lsn(v))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Lsn::from_str(v).map_err(|e| E::custom(e))
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(LsnVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_u64(LsnVisitor {
+                is_human_readable_deserializer: false,
+            })
+        }
+    }
+}
+
+/// Allows (de)serialization of an `Lsn` always as `u64`.
+///
+/// ### Example
+///
+/// ```rust
+/// # use serde::{Serialize, Deserialize};
+/// use utils::lsn::Lsn;
+///
+/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
+/// struct Foo {
+///   #[serde(with = "utils::lsn::serde_as_u64")]
+///   always_u64: Lsn,
+/// }
+///
+/// let orig = Foo { always_u64: Lsn(1234) };
+///
+/// let res = serde_json::to_string(&orig).unwrap();
+/// assert_eq!(res, r#"{"always_u64":1234}"#);
+///
+/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
+/// assert_eq!(foo, orig);
+/// ```
+///
+pub mod serde_as_u64 {
+    use super::Lsn;
+
+    /// Serializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
+    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
+        use serde::Serialize;
+        lsn.0.serialize(serializer)
+    }
+
+    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
+    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
+        use serde::Deserialize;
+        u64::deserialize(deserializer).map(Lsn)
+    }
+}
+
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
+    use crate::bin_ser::BeSer;
+
    use super::*;

+    use serde::ser::Serialize;
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -341,4 +450,95 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
+
+    #[test]
+    fn test_lsn_serde() {
+        let original_lsn = Lsn(0x0123456789abcdef);
+        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
+        let expected_non_readable_tokens =
+            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
+
+        // Testing human_readable ser/de
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(readable_ser_tokens, expected_readable_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Testing NON human_readable ser/de
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Testing mismatching ser/de
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+    }
+
+    #[test]
+    fn test_lsn_ensure_roundtrip() {
+        let original_lsn = Lsn(0xaaaabbbb);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(ser_tokens)
+            .build();
+
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_serde() {
+        let lsn = Lsn(0x0123456789abcdef);
+        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
+
+        let ser_bytes = lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_ensure_roundtrip() {
+        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
+        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
+
+        let ser_bytes = original_lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
-#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,6 +125,9 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

+            // Block any future waiters from starting
+            internal.shutdown = true;
+
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1,3 @@
+pub mod heavier_once_cell;
+
+pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -0,0 +1,158 @@
+use std::{sync::Arc, time::Duration};
+
+/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
+///
+/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
+/// the resource calls `close()` when they want to ensure that all holders of guards
+/// have released them, and that no future guards will be issued.
+pub struct Gate {
+    /// Each caller of enter() takes one unit from the semaphore. In close(), we
+    /// take all the units to ensure all GateGuards are destroyed.
+    sem: Arc<tokio::sync::Semaphore>,
+
+    /// For observability only: a name that will be used to log warnings if a particular
+    /// gate is holding up shutdown
+    name: String,
+}
+
+/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
+/// not complete.
+#[derive(Debug)]
+pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
+
+/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
+async fn warn_if_stuck<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_period: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();
+
+    let mut fut = std::pin::pin!(fut);
+
+    loop {
+        match tokio::time::timeout(warn_period, &mut fut).await {
+            Ok(ret) => return ret,
+            Err(_) => {
+                tracing::warn!(
+                    gate = name,
+                    elapsed_ms = started.elapsed().as_millis(),
+                    "still waiting, taking longer than expected..."
+                );
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum GateError {
+    GateClosed,
+}
+
+impl Gate {
+    const MAX_UNITS: u32 = u32::MAX;
+
+    pub fn new(name: String) -> Self {
+        Self {
+            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
+            name,
+        }
+    }
+
+    /// Acquire a guard that will prevent close() calls from completing. If close()
+    /// was already called, this will return an error which should be interpreted
+    /// as "shutting down".
+    ///
+    /// This function would typically be used from e.g. request handlers. While holding
+    /// the guard returned from this function, it is important to respect a CancellationToken
+    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
+    /// also contain a CancellationToken.
+    pub fn enter(&self) -> Result<GateGuard, GateError> {
+        self.sem
+            .clone()
+            .try_acquire_owned()
+            .map(GateGuard)
+            .map_err(|_| GateError::GateClosed)
+    }
+
+    /// Types with a shutdown() method and a gate should call this method at the
+    /// end of shutdown, to ensure that all GateGuard holders are done.
+    ///
+    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
+    /// important that the holders of such guards are respecting a CancellationToken which has
+    /// been cancelled before entering this function.
+    pub async fn close(&self) {
+        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
+    }
+
+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
+    async fn do_close(&self) {
+        tracing::debug!(gate = self.name, "Closing Gate...");
+        match self.sem.acquire_many(Self::MAX_UNITS).await {
+            Ok(_units) => {
+                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
+                self.sem.close();
+            }
+            Err(_) => {
+                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
+                // This is legal.  Timeline::shutdown for example is not protected from being called more than
+                // once.
+                tracing::debug!(gate = self.name, "Double close")
+            }
+        }
+        tracing::debug!(gate = self.name, "Closed Gate.")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use futures::FutureExt;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_idle_gate() {
+        // Having taken no gates, we should not be blocked in close
+        let gate = Gate::new("test".to_string());
+        gate.close().await;
+
+        // If a guard is dropped before entering, close should not be blocked
+        let gate = Gate::new("test".to_string());
+        let guard = gate.enter().unwrap();
+        drop(guard);
+        gate.close().await;
+
+        // Entering a closed guard fails
+        gate.enter().expect_err("enter should fail after close");
+    }
+
+    #[tokio::test]
+    async fn test_busy_gate() {
+        let gate = Gate::new("test".to_string());
+
+        let guard = gate.enter().unwrap();
+
+        let mut close_fut = std::pin::pin!(gate.close());
+
+        // Close should be blocked
+        assert!(close_fut.as_mut().now_or_never().is_none());
+
+        // Attempting to enter() should fail, even though close isn't done yet.
+        gate.enter()
+            .expect_err("enter should fail after entering close");
+
+        drop(guard);
+
+        // Guard is gone, close should finish
+        assert!(close_fut.as_mut().now_or_never().is_some());
+
+        // Attempting to enter() is still forbidden
+        gate.enter().expect_err("enter should fail finishing close");
+    }
+}
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,383 @@
+use std::sync::{
+    atomic::{AtomicUsize, Ordering},
+    Arc, Mutex, MutexGuard,
+};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+    initializers: AtomicUsize,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+            initializers: AtomicUsize::new(0),
+        }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    fn default() -> Self {
+        Self {
+            init_semaphore: Arc::new(Semaphore::new(1)),
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+            initializers: AtomicUsize::new(0),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+    {
+        let sem = {
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };
+
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.lock().unwrap();
+
+                Ok(Self::set0(value, guard))
+            }
+            Err(_closed) => {
+                let guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(Guard(guard));
+            }
+        }
+    }
+
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// to complete initializing the inner value.
+    ///
+    /// # Panics
+    ///
+    /// If the inner has already been initialized.
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();
+
+        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
+        // give more permits right now.
+        if guard.init_semaphore.try_acquire().is_ok() {
+            drop(guard);
+            panic!("permit is of wrong origin");
+        }
+
+        Self::set0(value, guard)
+    }
+
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+        if guard.value.is_some() {
+            drop(guard);
+            unreachable!("we won permit, must not be initialized");
+        }
+        guard.value = Some(value);
+        guard.init_semaphore.close();
+        Guard(guard)
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+
+    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
+    pub fn initializer_count(&self) -> usize {
+        self.initializers.load(Ordering::Relaxed)
+    }
+}
+
+/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
+/// initializing task for example at the end of initialization.
+struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
+
+impl<'a, T> CountWaitingInitializers<'a, T> {
+    fn start(target: &'a OnceCell<T>) -> Self {
+        target.initializers.fetch_add(1, Ordering::Relaxed);
+        CountWaitingInitializers(target)
+    }
+}
+
+impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
+    fn drop(&mut self) {
+        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.0
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for it's deinitialization.
+    ///
+    /// The permit will be on a semaphore part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+        let mut swapped = Inner::default();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
+        std::mem::swap(&mut *self.0, &mut swapped);
+        swapped
+            .value
+            .map(|v| (v, InitPermit(permit)))
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+/// Type held by OnceCell (de)initializing task.
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    let won = {
+                        let g = cell
+                            .get_or_init(|permit| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>((i, permit))
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    #[test]
+    fn reinit_with_deinit_permit() {
+        let cell = Arc::new(OnceCell::new(42));
+
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
+
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        assert_eq!(5, five);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
+    }
+
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            cell.get_or_init(|_permit| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|permit| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>(("never reached", permit))
+        });
+
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+
+pub enum TimeoutCancellableError {
+    Timeout,
+    Cancelled,
+}
+
+/// Wrap [`tokio::time::timeout`] with a CancellationToken.
+///
+/// This wrapper is appropriate for any long running operation in a task
+/// that ought to respect a CancellationToken (which means most tasks).
+///
+/// The only time you should use a bare tokio::timeout is when the future `F`
+/// itself respects a CancellationToken: otherwise, always use this wrapper
+/// with your CancellationToken to ensure that your task does not hold up
+/// graceful shutdown.
+pub async fn timeout_cancellable<F>(
+    duration: Duration,
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<F::Output, TimeoutCancellableError>
+where
+    F: std::future::Future,
+{
+    tokio::select!(
+        r = tokio::time::timeout(duration, future) => {
+            r.map_err(|_| TimeoutCancellableError::Timeout)
+
+        },
+        _ = cancel.cancelled() => {
+            Err(TimeoutCancellableError::Cancelled)
+
+        }
+    )
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,11 +21,6 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
-    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
-    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
-    /// memory available for the cgroup.
-    pub(crate) in_memory: bool,
-
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl FileCacheConfig {
-    pub fn default_in_memory() -> Self {
+impl Default for FileCacheConfig {
+    fn default() -> Self {
        Self {
-            in_memory: true,
-            // 75 %
-            resource_multiplier: 0.75,
-            // 640 MiB; (512 + 128)
-            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
-            // ensure any increase in file cache size is split 90-10 with 10% to other memory
-            spread_factor: 0.1,
-        }
-    }
-
-    pub fn default_on_disk() -> Self {
-        Self {
-            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -83,7 +65,9 @@ impl FileCacheConfig {
            spread_factor: 0.1,
        }
    }
+}

+impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
@@ -39,16 +41,6 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

-    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
-    /// kernel's page cache), and therefore should not count against available memory.
-    //
-    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
-    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
-    // during the switch away from an in-memory file cache, we had to default to the previous
-    // behavior.
-    #[arg(long)]
-    pub file_cache_on_disk: bool,
-
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,10 +156,7 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = match args.file_cache_on_disk {
-                true => FileCacheConfig::default_on_disk(),
-                false => FileCacheConfig::default_in_memory(),
-            };
+            let config = FileCacheConfig::default();

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -187,10 +184,7 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            if args.file_cache_on_disk {
-                file_cache_disk_size = actual_size;
-            }
-
+            file_cache_disk_size = actual_size;
            state.filecache = Some(file_cache);
        }

@@ -239,30 +233,35 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let (expected_file_cache_size, expected_file_cache_disk_size) = self
+        let expected_file_cache_size = self
            .filecache
            .as_ref()
-            .map(|file_cache| {
-                let size = file_cache.config.calculate_cache_size(usable_system_memory);
-                match file_cache.config.in_memory {
-                    true => (size, 0),
-                    false => (size, size),
-                }
-            })
-            .unwrap_or((0, 0));
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

+            // NB: The ordering of these conditions is intentional. During startup, we should deny
+            // downscaling until we have enough information to determine that it's safe to do so
+            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
+            // received any information, we should *fail* instead of just denying downscaling.
+            //
+            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
+            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
+            // OR if we haven't received metrics *recently enough*.
+            //
            // TODO: make the duration here configurable.
            if last_time.elapsed() > Duration::from_secs(5) {
                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
            } else if last_history.samples_count <= 1 {
-                bail!("haven't received enough cgroup memory stats yet");
+                let status = "haven't received enough cgroup memory stats yet";
+                info!(status, "discontinuing downscale");
+                return Ok((false, status.to_owned()));
            }

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_size);

            let current = last_history.avg_non_reclaimable;

@@ -289,13 +288,10 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;
            let message = format!(
-                "set file cache size to {} MiB (in memory = {})",
+                "set file cache size to {} MiB",
                bytes_to_mebibytes(actual_usage),
-                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -346,9 +342,7 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;

            if actual_usage != expected_usage {
                warn!(
@@ -505,11 +499,14 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        let error = e.to_string();
-                                        warn!(?error, "error handling message");
+                                        // use {:#} for our logging because the display impl only
+                                        // gives the outermost cause, and the debug impl
+                                        // pretty-prints the error, whereas {:#} contains all the
+                                        // causes, but is compact (no newlines).
+                                        warn!(error = format!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error
+                                                error: e.to_string(),
                                            },
                                            message.id
                                        )
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "walproposer"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+utils.workspace = true
+postgres_ffi.workspace = true
+
+workspace_hack.workspace = true
+
+[build-dependencies]
+anyhow.workspace = true
+bindgen.workspace = true
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -0,0 +1 @@
+#include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -0,0 +1,113 @@
+use std::{env, path::PathBuf, process::Command};
+
+use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;
+
+fn main() -> anyhow::Result<()> {
+    // Tell cargo to invalidate the built crate whenever the wrapper changes
+    println!("cargo:rerun-if-changed=bindgen_deps.h");
+
+    // Finding the location of built libraries and Postgres C headers:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
+    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
+    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        postgres_install_dir.into()
+    } else {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
+    };
+
+    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
+    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
+    let walproposer_lib_search_str = walproposer_lib_dir
+        .to_str()
+        .ok_or(anyhow!("Bad non-UTF path"))?;
+
+    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
+    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
+    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
+
+    println!("cargo:rustc-link-lib=static=pgport");
+    println!("cargo:rustc-link-lib=static=pgcommon");
+    println!("cargo:rustc-link-lib=static=walproposer");
+    println!("cargo:rustc-link-search={walproposer_lib_search_str}");
+
+    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .context("failed to execute `pg_config --includedir-server`")?;
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout)
+            .context("pg_config output is not UTF-8")?
+            .trim_end()
+            .into()
+    } else {
+        let server_path = pg_install_abs
+            .join("v16")
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string();
+        server_path
+            .into_string()
+            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
+    };
+
+    // The bindgen::Builder is the main entry point
+    // to bindgen, and lets you build up options for
+    // the resulting bindings.
+    let bindings = bindgen::Builder::default()
+        // The input header we would like to generate
+        // bindings for.
+        .header("bindgen_deps.h")
+        // Tell cargo to invalidate the built crate whenever any of the
+        // included header files changed.
+        .parse_callbacks(Box::new(CargoCallbacks))
+        .allowlist_type("WalProposer")
+        .allowlist_type("WalProposerConfig")
+        .allowlist_type("walproposer_api")
+        .allowlist_function("WalProposerCreate")
+        .allowlist_function("WalProposerStart")
+        .allowlist_function("WalProposerBroadcast")
+        .allowlist_function("WalProposerPoll")
+        .allowlist_function("WalProposerFree")
+        .allowlist_var("DEBUG5")
+        .allowlist_var("DEBUG4")
+        .allowlist_var("DEBUG3")
+        .allowlist_var("DEBUG2")
+        .allowlist_var("DEBUG1")
+        .allowlist_var("LOG")
+        .allowlist_var("INFO")
+        .allowlist_var("NOTICE")
+        .allowlist_var("WARNING")
+        .allowlist_var("ERROR")
+        .allowlist_var("FATAL")
+        .allowlist_var("PANIC")
+        .allowlist_var("WPEVENT")
+        .allowlist_var("WL_LATCH_SET")
+        .allowlist_var("WL_SOCKET_READABLE")
+        .allowlist_var("WL_SOCKET_WRITEABLE")
+        .allowlist_var("WL_TIMEOUT")
+        .allowlist_var("WL_SOCKET_CLOSED")
+        .allowlist_var("WL_SOCKET_MASK")
+        .clang_arg("-DWALPROPOSER_LIB")
+        .clang_arg(format!("-I{pgxn_neon}"))
+        .clang_arg(format!("-I{inc_server_path}"))
+        // Finish the builder and generate the bindings.
+        .generate()
+        // Unwrap the Result and panic on failure.
+        .expect("Unable to generate bindings");
+
+    // Write the bindings to the $OUT_DIR/bindings.rs file.
+    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
+    bindings
+        .write_to_file(out_path)
+        .expect("Couldn't write bindings!");
+
+    Ok(())
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -0,0 +1,457 @@
+#![allow(dead_code)]
+
+use std::ffi::CStr;
+use std::ffi::CString;
+
+use crate::bindings::uint32;
+use crate::bindings::walproposer_api;
+use crate::bindings::PGAsyncReadResult;
+use crate::bindings::PGAsyncWriteResult;
+use crate::bindings::Safekeeper;
+use crate::bindings::Size;
+use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
+use crate::bindings::TimestampTz;
+use crate::bindings::WalProposer;
+use crate::bindings::WalProposerConnStatusType;
+use crate::bindings::WalProposerConnectPollStatusType;
+use crate::bindings::WalProposerExecStatusType;
+use crate::bindings::WalproposerShmemState;
+use crate::bindings::XLogRecPtr;
+use crate::walproposer::ApiImpl;
+use crate::walproposer::WaitResult;
+
+extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_shmem_state()
+    }
+}
+
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).start_streaming(startpos)
+    }
+}
+
+extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_flush_rec_ptr()
+    }
+}
+
+extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_current_timestamp()
+    }
+}
+
+extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let msg = (*api).conn_error_message(&mut (*sk));
+        let msg = CString::new(msg).unwrap();
+        // TODO: fix leaking error message
+        msg.into_raw()
+    }
+}
+
+extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_status(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_start(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_poll(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
+    let query = unsafe { CStr::from_ptr(query) };
+    let query = query.to_str().unwrap();
+
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_send_query(&mut (*sk), query)
+    }
+}
+
+extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_get_query_result(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_flush(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_finish(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_finish(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_async_read(
+    sk: *mut Safekeeper,
+    buf: *mut *mut ::std::os::raw::c_char,
+    amount: *mut ::std::os::raw::c_int,
+) -> PGAsyncReadResult {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let (res, result) = (*api).conn_async_read(&mut (*sk));
+
+        // This function has guarantee that returned buf will be valid until
+        // the next call. So we can store a Vec in each Safekeeper and reuse
+        // it on the next call.
+        let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
+
+        inbuf.clear();
+        inbuf.extend_from_slice(res);
+
+        // Put a Vec back to sk->inbuf and return data ptr.
+        *buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
+        *amount = res.len() as i32;
+
+        result
+    }
+}
+
+extern "C" fn conn_async_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> PGAsyncWriteResult {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_async_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn conn_blocking_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_blocking_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+#[allow(clippy::unnecessary_cast)]
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_read(&mut (*sk), buf, startptr)
+    }
+}
+
+extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_reader_allocate(&mut (*sk));
+    }
+}
+
+extern "C" fn free_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).free_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn init_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).init_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).update_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).add_safekeeper_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn wait_event_set(
+    wp: *mut WalProposer,
+    timeout: ::std::os::raw::c_long,
+    event_sk: *mut *mut Safekeeper,
+    events: *mut uint32,
+) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let result = (*api).wait_event_set(&mut (*wp), timeout);
+        match result {
+            WaitResult::Latch => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_LATCH_SET;
+                1
+            }
+            WaitResult::Timeout => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_TIMEOUT;
+                0
+            }
+            WaitResult::Network(sk, event_mask) => {
+                *event_sk = sk;
+                *events = event_mask;
+                1
+            }
+        }
+    }
+}
+
+extern "C" fn strong_random(
+    wp: *mut WalProposer,
+    buf: *mut ::std::os::raw::c_void,
+    len: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).strong_random(buf)
+    }
+}
+
+extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_redo_start_lsn()
+    }
+}
+
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).finish_sync_safekeepers(lsn)
+    }
+}
+
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+    }
+}
+
+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).confirm_wal_streamed(&mut (*wp), lsn)
+    }
+}
+
+extern "C" fn log_internal(
+    wp: *mut WalProposer,
+    level: ::std::os::raw::c_int,
+    line: *const ::std::os::raw::c_char,
+) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let line = CStr::from_ptr(line);
+        let line = line.to_str().unwrap();
+        (*api).log_internal(&mut (*wp), Level::from(level as u32), line)
+    }
+}
+
+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).after_election(&mut (*wp))
+    }
+}
+
+#[derive(Debug)]
+pub enum Level {
+    Debug5,
+    Debug4,
+    Debug3,
+    Debug2,
+    Debug1,
+    Log,
+    Info,
+    Notice,
+    Warning,
+    Error,
+    Fatal,
+    Panic,
+    WPEvent,
+}
+
+impl Level {
+    pub fn from(elevel: u32) -> Level {
+        use crate::bindings::*;
+
+        match elevel {
+            DEBUG5 => Level::Debug5,
+            DEBUG4 => Level::Debug4,
+            DEBUG3 => Level::Debug3,
+            DEBUG2 => Level::Debug2,
+            DEBUG1 => Level::Debug1,
+            LOG => Level::Log,
+            INFO => Level::Info,
+            NOTICE => Level::Notice,
+            WARNING => Level::Warning,
+            ERROR => Level::Error,
+            FATAL => Level::Fatal,
+            PANIC => Level::Panic,
+            WPEVENT => Level::WPEvent,
+            _ => panic!("unknown log level {}", elevel),
+        }
+    }
+}
+
+pub(crate) fn create_api() -> walproposer_api {
+    walproposer_api {
+        get_shmem_state: Some(get_shmem_state),
+        start_streaming: Some(start_streaming),
+        get_flush_rec_ptr: Some(get_flush_rec_ptr),
+        get_current_timestamp: Some(get_current_timestamp),
+        conn_error_message: Some(conn_error_message),
+        conn_status: Some(conn_status),
+        conn_connect_start: Some(conn_connect_start),
+        conn_connect_poll: Some(conn_connect_poll),
+        conn_send_query: Some(conn_send_query),
+        conn_get_query_result: Some(conn_get_query_result),
+        conn_flush: Some(conn_flush),
+        conn_finish: Some(conn_finish),
+        conn_async_read: Some(conn_async_read),
+        conn_async_write: Some(conn_async_write),
+        conn_blocking_write: Some(conn_blocking_write),
+        recovery_download: Some(recovery_download),
+        wal_read: Some(wal_read),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
+        init_event_set: Some(init_event_set),
+        update_event_set: Some(update_event_set),
+        add_safekeeper_event_set: Some(add_safekeeper_event_set),
+        wait_event_set: Some(wait_event_set),
+        strong_random: Some(strong_random),
+        get_redo_start_lsn: Some(get_redo_start_lsn),
+        finish_sync_safekeepers: Some(finish_sync_safekeepers),
+        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
+        log_internal: Some(log_internal),
+        after_election: Some(after_election),
+    }
+}
+
+impl std::fmt::Display for Level {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+/// Take ownership of `Vec<u8>` from StringInfoData.
+#[allow(clippy::unnecessary_cast)]
+pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
+    if pg.data.is_null() {
+        return None;
+    }
+
+    let ptr = pg.data as *mut u8;
+    let length = pg.len as usize;
+    let capacity = pg.maxlen as usize;
+
+    pg.data = std::ptr::null_mut();
+    pg.len = 0;
+    pg.maxlen = 0;
+
+    unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
+}
+
+/// Store `Vec<u8>` in StringInfoData.
+fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
+    let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
+    let length = vec.len();
+    let capacity = vec.capacity();
+
+    assert!(pg.data.is_null());
+
+    pg.data = ptr;
+    pg.len = length as i32;
+    pg.maxlen = capacity as i32;
+
+    std::mem::forget(vec);
+
+    ptr
+}
--- a/libs/walproposer/src/lib.rs
+++ b/libs/walproposer/src/lib.rs
@@ -0,0 +1,14 @@
+pub mod bindings {
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    // bindgen creates some unsafe code with no doc comments.
+    #![allow(clippy::missing_safety_doc)]
+    // noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+    #![allow(clippy::useless_transmute)]
+
+    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
+}
+
+pub mod api_bindings;
+pub mod walproposer;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -0,0 +1,485 @@
+use std::ffi::CString;
+
+use postgres_ffi::WAL_SEGMENT_SIZE;
+use utils::id::TenantTimelineId;
+
+use crate::{
+    api_bindings::{create_api, take_vec_u8, Level},
+    bindings::{
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
+    },
+};
+
+/// Rust high-level wrapper for C walproposer API. Many methods are not required
+/// for simple cases, hence todo!() in default implementations.
+///
+/// Refer to `pgxn/neon/walproposer.h` for documentation.
+pub trait ApiImpl {
+    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
+        todo!()
+    }
+
+    fn start_streaming(&self, _startpos: u64) {
+        todo!()
+    }
+
+    fn get_flush_rec_ptr(&self) -> u64 {
+        todo!()
+    }
+
+    fn get_current_timestamp(&self) -> i64 {
+        todo!()
+    }
+
+    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
+        todo!()
+    }
+
+    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
+        todo!()
+    }
+
+    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_connect_poll(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerConnectPollStatusType {
+        todo!()
+    }
+
+    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
+        todo!()
+    }
+
+    fn conn_get_query_result(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerExecStatusType {
+        todo!()
+    }
+
+    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
+        todo!()
+    }
+
+    fn conn_finish(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+        todo!()
+    }
+
+    fn conn_async_write(
+        &self,
+        _sk: &mut Safekeeper,
+        _buf: &[u8],
+    ) -> crate::bindings::PGAsyncWriteResult {
+        todo!()
+    }
+
+    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
+        todo!()
+    }
+
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
+        todo!()
+    }
+
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
+        todo!()
+    }
+
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn free_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn init_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
+        todo!()
+    }
+
+    fn strong_random(&self, _buf: &mut [u8]) -> bool {
+        todo!()
+    }
+
+    fn get_redo_start_lsn(&self) -> u64 {
+        todo!()
+    }
+
+    fn finish_sync_safekeepers(&self, _lsn: u64) {
+        todo!()
+    }
+
+    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+        todo!()
+    }
+
+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
+    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
+        todo!()
+    }
+
+    fn after_election(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+}
+
+pub enum WaitResult {
+    Latch,
+    Timeout,
+    Network(*mut Safekeeper, u32),
+}
+
+pub struct Config {
+    /// Tenant and timeline id
+    pub ttid: TenantTimelineId,
+    /// List of safekeepers in format `host:port`
+    pub safekeepers_list: Vec<String>,
+    /// Safekeeper reconnect timeout in milliseconds
+    pub safekeeper_reconnect_timeout: i32,
+    /// Safekeeper connection timeout in milliseconds
+    pub safekeeper_connection_timeout: i32,
+    /// walproposer mode, finish when all safekeepers are synced or subscribe
+    /// to WAL streaming
+    pub sync_safekeepers: bool,
+}
+
+/// WalProposer main struct. C methods are reexported as Rust functions.
+pub struct Wrapper {
+    wp: *mut WalProposer,
+    _safekeepers_list_vec: Vec<u8>,
+}
+
+impl Wrapper {
+    pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
+        let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
+            .unwrap()
+            .into_raw();
+        let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
+            .unwrap()
+            .into_raw();
+
+        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
+            .unwrap()
+            .into_bytes_with_nul();
+        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
+
+        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
+
+        let c_config = WalProposerConfig {
+            neon_tenant,
+            neon_timeline,
+            safekeepers_list,
+            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
+            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
+            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
+            syncSafekeepers: config.sync_safekeepers,
+            systemId: 0,
+            pgTimeline: 1,
+            callback_data,
+        };
+        let c_config = Box::into_raw(Box::new(c_config));
+
+        let api = create_api();
+        let wp = unsafe { WalProposerCreate(c_config, api) };
+        Wrapper {
+            wp,
+            _safekeepers_list_vec: safekeepers_list_vec,
+        }
+    }
+
+    pub fn start(&self) {
+        unsafe { WalProposerStart(self.wp) }
+    }
+}
+
+impl Drop for Wrapper {
+    fn drop(&mut self) {
+        unsafe {
+            let config = (*self.wp).config;
+            drop(Box::from_raw(
+                (*config).callback_data as *mut Box<dyn ApiImpl>,
+            ));
+            drop(CString::from_raw((*config).neon_tenant));
+            drop(CString::from_raw((*config).neon_timeline));
+            drop(Box::from_raw(config));
+
+            for i in 0..(*self.wp).n_safekeepers {
+                let sk = &mut (*self.wp).safekeeper[i as usize];
+                take_vec_u8(&mut sk.inbuf);
+            }
+
+            WalProposerFree(self.wp);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        cell::Cell,
+        sync::{atomic::AtomicUsize, mpsc::sync_channel},
+    };
+
+    use utils::id::TenantTimelineId;
+
+    use crate::{api_bindings::Level, walproposer::Wrapper};
+
+    use super::ApiImpl;
+
+    #[derive(Clone, Copy, Debug)]
+    struct WaitEventsData {
+        sk: *mut crate::bindings::Safekeeper,
+        event_mask: u32,
+    }
+
+    struct MockImpl {
+        // data to return from wait_event_set
+        wait_events: Cell<WaitEventsData>,
+        // walproposer->safekeeper messages
+        expected_messages: Vec<Vec<u8>>,
+        expected_ptr: AtomicUsize,
+        // safekeeper->walproposer messages
+        safekeeper_replies: Vec<Vec<u8>>,
+        replies_ptr: AtomicUsize,
+        // channel to send LSN to the main thread
+        sync_channel: std::sync::mpsc::SyncSender<u64>,
+    }
+
+    impl MockImpl {
+        fn check_walproposer_msg(&self, msg: &[u8]) {
+            let ptr = self
+                .expected_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            if ptr >= self.expected_messages.len() {
+                panic!("unexpected message from walproposer");
+            }
+
+            let expected_msg = &self.expected_messages[ptr];
+            assert_eq!(msg, expected_msg.as_slice());
+        }
+
+        fn next_safekeeper_reply(&self) -> &[u8] {
+            let ptr = self
+                .replies_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            if ptr >= self.safekeeper_replies.len() {
+                panic!("no more safekeeper replies");
+            }
+
+            &self.safekeeper_replies[ptr]
+        }
+    }
+
+    impl ApiImpl for MockImpl {
+        fn get_current_timestamp(&self) -> i64 {
+            println!("get_current_timestamp");
+            0
+        }
+
+        fn conn_status(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnStatusType {
+            println!("conn_status");
+            crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+
+        fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("conn_connect_start");
+        }
+
+        fn conn_connect_poll(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnectPollStatusType {
+            println!("conn_connect_poll");
+            crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+        }
+
+        fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
+            println!("conn_send_query: {}", query);
+            true
+        }
+
+        fn conn_get_query_result(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerExecStatusType {
+            println!("conn_get_query_result");
+            crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+        }
+
+        fn conn_async_read(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+            println!("conn_async_read");
+            let reply = self.next_safekeeper_reply();
+            println!("conn_async_read result: {:?}", reply);
+            (
+                reply,
+                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
+            )
+        }
+
+        fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
+            println!("conn_blocking_write: {:?}", buf);
+            self.check_walproposer_msg(buf);
+            true
+        }
+
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
+        }
+
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
+        }
+
+        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("init_event_set")
+        }
+
+        fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "update_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        fn wait_event_set(
+            &self,
+            _: &mut crate::bindings::WalProposer,
+            timeout_millis: i64,
+        ) -> super::WaitResult {
+            let data = self.wait_events.get();
+            println!(
+                "wait_event_set, timeout_millis={}, res={:?}",
+                timeout_millis, data
+            );
+            super::WaitResult::Network(data.sk, data.event_mask)
+        }
+
+        fn strong_random(&self, buf: &mut [u8]) -> bool {
+            println!("strong_random");
+            buf.fill(0);
+            true
+        }
+
+        fn finish_sync_safekeepers(&self, lsn: u64) {
+            self.sync_channel.send(lsn).unwrap();
+            panic!("sync safekeepers finished at lsn={}", lsn);
+        }
+
+        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
+            println!("walprop_log[{}] {}", level, msg);
+        }
+
+        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
+            println!("after_election");
+        }
+    }
+
+    /// Test that walproposer can successfully connect to safekeeper and finish
+    /// sync_safekeepers. API is mocked in MockImpl.
+    ///
+    /// Run this test with valgrind to detect leaks:
+    /// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
+    #[test]
+    fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
+        let ttid = TenantTimelineId::new(
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+        );
+
+        let (sender, receiver) = sync_channel(1);
+
+        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
+            wait_events: Cell::new(WaitEventsData {
+                sk: std::ptr::null_mut(),
+                event_mask: 0,
+            }),
+            expected_messages: vec![
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            expected_ptr: AtomicUsize::new(0),
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            replies_ptr: AtomicUsize::new(0),
+            sync_channel: sender,
+        });
+        let config = crate::walproposer::Config {
+            ttid,
+            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 10000,
+            sync_safekeepers: true,
+        };
+
+        let wp = Wrapper::new(my_impl, config);
+
+        // walproposer will panic when it finishes sync_safekeepers
+        std::panic::catch_unwind(|| wp.start()).unwrap_err();
+        // validate the resulting LSN
+        assert_eq!(receiver.recv()?, 1337);
+        Ok(())
+        // drop() will free up resources here
+    }
+}
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,22 +1,21 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => {
-            bail!("SafekeeperData scope makes no sense for Pageserver")
-        }
+        (Scope::SafekeeperData, _) => Err(AuthError(
+            "SafekeeperData scope makes no sense for Pageserver".into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,6 +2,7 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
+use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
@@ -33,11 +34,15 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 const PID_FILE_NAME: &str = "pageserver.pid";

@@ -200,6 +205,51 @@ fn initialize_config(
    })
 }

+struct WaitForPhaseResult<F: std::future::Future + Unpin> {
+    timeout_remaining: Duration,
+    skipped: Option<F>,
+}
+
+/// During startup, we apply a timeout to our waits for readiness, to avoid
+/// stalling the whole service if one Tenant experiences some problem.  Each
+/// phase may consume some of the timeout: this function returns the updated
+/// timeout for use in the next call.
+async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
+where
+    F: std::future::Future + Unpin,
+{
+    let initial_t = Instant::now();
+    let skipped = match tokio::time::timeout(timeout, &mut fut).await {
+        Ok(_) => None,
+        Err(_) => {
+            tracing::info!(
+                timeout_millis = timeout.as_millis(),
+                %phase,
+                "Startup phase timed out, proceeding anyway"
+            );
+            Some(fut)
+        }
+    };
+
+    WaitForPhaseResult {
+        timeout_remaining: timeout
+            .checked_sub(Instant::now().duration_since(initial_t))
+            .unwrap_or(Duration::ZERO),
+        skipped,
+    }
+}
+
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+    let elapsed = started_at.elapsed();
+    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+
+    info!(
+        elapsed_ms = elapsed.as_millis(),
+        "{human_phase} ({secs:.3}s since start)"
+    )
+}
+
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
@@ -207,26 +257,17 @@ fn start_pageserver(
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
    info!(
-        "version: {} launch_timestamp: {}",
+        "version: {} launch_timestamp: {} build_tag: {}",
        version(),
-        launch_ts.to_string()
+        launch_ts.to_string(),
+        BUILD_TAG,
    );
-    set_build_info_metric(GIT_VERSION);
+    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_launch_timestamp_metric(launch_ts);
    pageserver::preinitialize_metrics();

@@ -283,13 +324,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -341,7 +381,7 @@ fn start_pageserver(

    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
+    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
@@ -361,24 +401,18 @@ fn start_pageserver(
    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();

    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
-    let (tenants_can_start, tenants_can_start_barrier) = utils::completion::channel();
-
-    tracing::info!("init_remote_done_tx:");
-    let c = init_remote_done_tx.clone();
-    drop(c);

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
-        tenants_can_start: tenants_can_start_barrier.clone(),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -388,66 +422,100 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial load completed")
+            });

-            init_remote_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
+            let timeout = conf.background_task_maximum_delay;

-            drop(tenants_can_start);
+            let init_remote_done = std::pin::pin!(async {
+                init_remote_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load_remote",
+                    "Remote part of initial load completed",
+                );
+            });

-            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_remote_skipped,
+            } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
+
+            let init_load_done = std::pin::pin!(async {
+                init_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load",
+                    "Initial load completed",
+                );
+                STARTUP_IS_LOADING.set(0);
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_load_skipped,
+            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });

-            let timeout = conf.background_task_maximum_delay;
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });

-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-
-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
-                    None
-                }
-                Err(_) => {
-                    tracing::info!(
-                        timeout_millis = timeout.as_millis(),
-                        "Initial logical size timeout elapsed; starting background jobs"
-                    );
-                    Some(init_sizes_done)
-                }
-            };
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;

            scopeguard::ScopeGuard::into_inner(guard);

-            // allow background jobs to start
+            // allow background jobs to start: we either completed prior stages, or they reached timeout
+            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
+            // because things like consumption metrics for billing are blocked by this barrier.
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");
-
-            if let Some(init_sizes_done) = init_sizes_done {
-                // ending up here is not a bug; at the latest logical sizes will be queried by
-                // consumption metrics.
-                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-                init_sizes_done.await;
-
-                scopeguard::ScopeGuard::into_inner(guard);
-
-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+            startup_checkpoint(
+                started_startup_at,
+                "background_jobs_can_start",
+                "Starting background jobs",
+            );

+            // We are done. If we skipped any phases due to timeout, run them to completion here so that
+            // they will eventually update their startup_checkpoint, and so that we do not declare the
+            // 'complete' stage until all the other stages are really done.
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before waiting for skipped phases done")
+            });
+            if let Some(f) = init_remote_skipped {
+                f.await;
            }
+            if let Some(f) = init_load_skipped {
+                f.await;
+            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
+            scopeguard::ScopeGuard::into_inner(guard);

-            startup_checkpoint("complete", "Startup complete");
+            startup_checkpoint(started_startup_at, "complete", "Startup complete");
        };

        async move {
@@ -483,6 +551,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs: NonZeroUsize::new(
-                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
-                    )
-                        .unwrap(),
-                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
-                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs,
-                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -11,6 +11,7 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -88,22 +89,12 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
    loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        if metrics.is_empty() {
-            continue;
-        }
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -142,10 +133,19 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            metric_collection_interval,
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }

@@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -268,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
@@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            synthetic_size_calculation_interval,
            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -3,7 +3,6 @@ use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use serde_with::serde_as;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -42,13 +41,10 @@ pub(super) enum Name {
 ///
 /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
 /// elsewhere.
-#[serde_with::serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(crate) struct MetricsKey {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,

-    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,

@@ -206,7 +202,6 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
-                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,5 +1,4 @@
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
-use serde_with::serde_as;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -7,12 +6,9 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
 use utils::id::{TenantId, TimelineId};

 /// How the metrics from pageserver are identified.
-#[serde_with::serde_as]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
 struct Ids {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,
-    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -57,7 +57,10 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
-            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
+            );
            client = client.default_headers(headers);
        }

@@ -144,7 +147,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.generation)))
+            .map(|t| (t.id, Generation::new(t.gen)))
            .collect::<HashMap<_, _>>())
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -17,7 +18,6 @@ use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
-use serde_with::serde_as;
 use thiserror::Error;
 use tokio;
 use tokio_util::sync::CancellationToken;
@@ -214,7 +214,6 @@ where
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionList {
    /// Serialization version, for future use
@@ -243,7 +242,6 @@ struct DeletionList {
    validated: bool,
 }

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionHeader {
    /// Serialization version, for future use
@@ -271,7 +269,9 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .map_err(Into::into)
+            .maybe_fatal_err("save deletion header")?;
+
+        Ok(())
    }
 }

@@ -360,6 +360,7 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
+            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
@@ -892,14 +893,6 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    Err(anyhow::anyhow!(e))
+                    on_fatal_io_error(&e, "reading deletion header");
                }
            }
        }
@@ -216,16 +218,9 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
-            Ok(d) => d,
-            Err(e) => {
-                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
-
-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
-            }
-        };
+        let mut dir = tokio::fs::read_dir(&deletion_directory)
+            .await
+            .fatal_err("read deletion directory");

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -233,7 +228,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await? {
+        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -246,11 +241,9 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
-                    // Non-fatal error: we will just leave the file behind but not
-                    // try and load it.
-                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
-                }
+                tokio::fs::remove_file(&absolute_path)
+                    .await
+                    .fatal_err("delete temp file");

                continue;
            }
@@ -290,7 +283,9 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path).await?;
+            let list_bytes = tokio::fs::read(&list_path)
+                .await
+                .fatal_err("read deletion list");

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
@@ -349,7 +344,7 @@ impl ListWriter {
        info!("Started deletion frontend worker");

        // Synchronous, but we only do it once per process lifetime so it's tolerable
-        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+        if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
+use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -287,16 +288,9 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-
-            if let Err(e) = tokio::fs::remove_file(&list_path).await {
-                // Unexpected: we should have permissions and nothing else should
-                // be touching these files.  We will leave the file behind.  Subsequent
-                // pageservers will try and load it again: hopefully whatever storage
-                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {list_path}: {e:#}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
-            }
+            tokio::fs::remove_file(&list_path)
+                .await
+                .fatal_err("remove deletion list");
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +346,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
+        // tasks to evict all seen layers until we have evicted enough
+
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -369,69 +374,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // I dislike naming of `available_permits` but it means current total amount of permits
+        // because permits can be added
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                                warn!(%layer, "failed to evict layer: {detail}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
+        // chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -446,7 +483,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

@@ -508,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -517,6 +554,11 @@ async fn collect_eviction_candidates(
            }
        };

+        if tenant.cancel.is_cancelled() {
+            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
+            continue;
+        }
+
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,6 +52,31 @@ paths:
              schema:
                type: object

+  /v1/reload_auth_validation_keys:
+    post:
+      description: Reloads the JWT public keys from their pre-configured location on disk.
+      responses:
+        "200":
+          description: The reload completed successfully.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error (also hits if no keys were found)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -327,7 +352,8 @@ paths:
          in: query
          required: true
          schema:
-            type: integer
+            type: string
+            format: hex
          description: A LSN to get the timestamp
      responses:
        "200":
@@ -392,13 +418,19 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
+        - name: version
+          in: query
+          required: false
+          schema:
+            type: integer
+          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                type: string
+                $ref: "#/components/schemas/LsnByTimestampResponse"
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -563,7 +595,17 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
-          description: Tenant download is already in progress
+          description: |
+            The tenant is already known to Pageserver in some way,
+            and hence this `/attach` call has been rejected.
+
+            Some examples of how this can happen:
+            - tenant was created on this pageserver
+            - tenant attachment was started by an earlier call to `/attach`.
+
+            Callers should poll the tenant status's `attachment_status` field,
+            like for status 202. See the longer description for `POST /attach`
+            for details.
          content:
            application/json:
              schema:
@@ -707,6 +749,12 @@ paths:

        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLoadRequest"
      responses:
        "202":
          description: Tenant scheduled to load successfully
@@ -1197,6 +1245,15 @@ components:
            new_tenant_id:
              type: string
              format: hex
+            generation:
+              type: integer
+              description: Attachment generation number.
+    TenantLoadRequest:
+      type: object
+      properties:
+        generation:
+          type: integer
+          description: Attachment generation number.
    TenantAttachRequest:
      type: object
      required:
@@ -1384,6 +1441,19 @@ components:
          type: string
          format: hex

+    LsnByTimestampResponse:
+      type: object
+      required:
+        - lsn
+        - kind
+      properties:
+        lsn:
+          type: string
+          format: hex
+        kind:
+          type: string
+          enum: [past, present, future, nodata]
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -8,7 +8,7 @@ use std::sync::Arc;
 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
-use hyper::header::CONTENT_TYPE;
+use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -35,7 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -44,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -62,7 +64,8 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    auth: Option<Arc<JwtAuth>>,
+    tenant_manager: Arc<TenantManager>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -73,7 +76,8 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        auth: Option<Arc<JwtAuth>>,
+        tenant_manager: Arc<TenantManager>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -85,6 +89,7 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
+            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -146,28 +151,59 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}").into())
-            }
-            TenantMapInsertError::TenantAlreadyExists(id, state) => {
-                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
-            }
-            TenantMapInsertError::TenantExistsSecondary(id) => {
-                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
-            }
+            TenantMapInsertError::SlotError(e) => e.into(),
+            TenantMapInsertError::SlotUpsertError(e) => e.into(),
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

+impl From<TenantSlotError> for ApiError {
+    fn from(e: TenantSlotError) -> ApiError {
+        use TenantSlotError::*;
+        match e {
+            NotFound(tenant_id) => {
+                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
+            }
+            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
+            InProgress => {
+                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
+            }
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantSlotUpsertError> for ApiError {
+    fn from(e: TenantSlotUpsertError) -> ApiError {
+        use TenantSlotUpsertError::*;
+        match e {
+            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantMapError> for ApiError {
+    fn from(e: TenantMapError) -> ApiError {
+        use TenantMapError::*;
+        match e {
+            StillInitializing | ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{e}").into())
+            }
+        }
+    }
+}
+
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
+            TenantStateError::SlotError(e) => e.into(),
+            TenantStateError::SlotUpsertError(e) => e.into(),
+            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
        }
    }
 }
@@ -188,6 +224,7 @@ impl From<GetTenantError> for ApiError {
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
+            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
        }
    }
 }
@@ -242,6 +279,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
+            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
+            SlotError(e) => e.into(),
+            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -263,11 +303,7 @@ async fn build_timeline_info(
        // we're executing this function, we will outlive the timeline on-disk state.
        info.current_logical_size_non_incremental = Some(
            timeline
-                .get_current_logical_size_non_incremental(
-                    info.last_record_lsn,
-                    CancellationToken::new(),
-                    ctx,
-                )
+                .get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
                .await?,
        );
    }
@@ -353,6 +389,32 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

+async fn reload_auth_validation_keys_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let config = get_config(&request);
+    let state = get_state(&request);
+    let Some(shared_auth) = &state.auth else {
+        return json_response(StatusCode::BAD_REQUEST, ());
+    };
+    // unwrap is ok because check is performed when creating config, so path is set and exists
+    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
+    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
+
+    match JwtAuth::from_key_path(key_path) {
+        Ok(new_auth) => {
+            shared_auth.swap(new_auth);
+            json_response(StatusCode::OK, ())
+        }
+        Err(e) => {
+            warn!("Error reloading public keys from {key_path:?}: {e:}");
+            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+        }
+    }
+}
+
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -368,7 +430,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -396,6 +458,9 @@ async fn timeline_create_handler(
            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
            }
+            Err(tenant::CreateTimelineError::ShuttingDown) => {
+                json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
+            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -415,7 +480,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -454,7 +519,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -484,6 +549,8 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let version: Option<u8> = parse_query_param(&request, "version")?;
+
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -495,13 +562,30 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    let result = match result {
-        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-        LsnForTimestamp::Future(_lsn) => "future".into(),
-        LsnForTimestamp::Past(_lsn) => "past".into(),
-        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-    };
-    json_response(StatusCode::OK, result)
+    if version.unwrap_or(0) > 1 {
+        #[derive(serde::Serialize)]
+        struct Result {
+            lsn: Lsn,
+            kind: &'static str,
+        }
+        let (lsn, kind) = match result {
+            LsnForTimestamp::Present(lsn) => (lsn, "present"),
+            LsnForTimestamp::Future(lsn) => (lsn, "future"),
+            LsnForTimestamp::Past(lsn) => (lsn, "past"),
+            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+        };
+        json_response(StatusCode::OK, Result { lsn, kind })
+    } else {
+        // FIXME: this is a temporary crutch not to break backwards compatibility
+        // See https://github.com/neondatabase/neon/pull/5608
+        let result = match result {
+            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+            LsnForTimestamp::Future(_lsn) => "future".into(),
+            LsnForTimestamp::Past(_lsn) => "past".into(),
+            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+        };
+        json_response(StatusCode::OK, result)
+    }
 }

 async fn get_timestamp_of_lsn_handler(
@@ -520,7 +604,7 @@ async fn get_timestamp_of_lsn_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
+    let result = timeline.get_max_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
        Some(time) => {
@@ -691,7 +775,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -754,7 +838,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;

    // this can be long operation
    let inputs = tenant
@@ -767,6 +851,10 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
+    let accepts_html = headers
+        .get(header::ACCEPT)
+        .map(|v| v == "text/html")
+        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -774,21 +862,19 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if headers["Accept"] == "text/html" {
+        if accepts_html {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if headers["Accept"] == "text/html" {
+    } else if accepts_html {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
    }

    /// The type resides in the pageserver not to expose `ModelInputs`.
-    #[serde_with::serde_as]
    #[derive(serde::Serialize)]
    struct TenantHistorySize {
-        #[serde_as(as = "serde_with::DisplayFromStr")]
        id: TenantId,
        /// Size is a mixture of WAL and logical size, so the unit is bytes.
        ///
@@ -929,7 +1015,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .header(header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1009,7 +1095,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false).await?;
+    let tenant = mgr::get_tenant(tenant_id, false)?;

    let response = HashMap::from([
        (
@@ -1068,7 +1154,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::NotFound(_) => {
+                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1080,20 +1166,14 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
+    state
+        .tenant_manager
+        .upsert_location(tenant_id, location_conf, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1106,7 +1186,6 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
-        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1179,7 +1258,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1204,7 +1283,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1310,7 +1389,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(header::CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1399,7 +1478,7 @@ async fn timeline_collect_keyspace(
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
@@ -1411,7 +1490,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1474,11 +1553,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1496,7 +1575,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
@@ -1589,6 +1667,8 @@ where
        );

        match handle.await {
+            // TODO: never actually return Err from here, always Ok(...) so that we can log
+            // spanned errors. Call api_error_handler instead and return appropriate Body.
            Ok(result) => result,
            Err(e) => {
                // The handler task panicked. We have a global panic handler that logs the
@@ -1637,7 +1717,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1666,6 +1746,9 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .post("/v1/reload_auth_validation_keys", |r| {
+            api_handler(r, reload_auth_validation_keys_handler)
+        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 mod auth;
 pub mod basebackup;
 pub mod config;
@@ -61,14 +63,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

-    // Shut down any page service tasks.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
-        "shutdown PageRequestHandlers",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
    timed(
@@ -78,6 +72,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

+    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
+    // should already have been canclled via mgr::shutdown_all_tenants
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    if let Some(mut deletion_queue) = deletion_queue {
        deletion_queue.shutdown(Duration::from_secs(5)).await;
@@ -149,6 +152,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

+// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
+// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
+// from the name.
+
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
@@ -186,8 +193,6 @@ pub struct InitializationOrder {
    /// attempt. It is important to drop this once the attempt has completed.
    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

-    pub tenants_can_start: utils::completion::Barrier,
-
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -962,6 +962,32 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

+pub(crate) struct TenantManagerMetrics {
+    pub(crate) tenant_slots: UIntGauge,
+    pub(crate) tenant_slot_writes: IntCounter,
+    pub(crate) unexpected_errors: IntCounter,
+}
+
+pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
+    TenantManagerMetrics {
+    tenant_slots: register_uint_gauge!(
+        "pageserver_tenant_manager_slots",
+        "How many slots currently exist, including all attached, secondary and in-progress operations",
+    )
+    .expect("failed to define a metric"),
+    tenant_slot_writes: register_int_counter!(
+        "pageserver_tenant_manager_slot_writes",
+        "Writes to a tenant slot, including all of create/attach/detach/delete"
+    )
+    .expect("failed to define a metric"),
+    unexpected_errors: register_int_counter!(
+        "pageserver_tenant_manager_unexpected_errors_total",
+        "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
+    )
+    .expect("failed to define a metric"),
+}
+});
+
 pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
@@ -1199,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the Postgres WAL redo process",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
@@ -1235,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) struct WalRedoProcessCounters {
+    pub(crate) started: IntCounter,
+    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+}
+
+#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub(crate) enum WalRedoKillCause {
+    WalRedoProcessDrop,
+    NoLeakChildDrop,
+    Startup,
+}
+
+impl Default for WalRedoProcessCounters {
+    fn default() -> Self {
+        let started = register_int_counter!(
+            "pageserver_wal_redo_process_started_total",
+            "Number of WAL redo processes started",
+        )
+        .unwrap();
+
+        let killed = register_int_counter_vec!(
+            "pageserver_wal_redo_process_stopped_total",
+            "Number of WAL redo processes stopped",
+            &["cause"],
+        )
+        .unwrap();
+        Self {
+            started,
+            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
+                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause_str: &'static str = cause.into();
+                killed.with_label_values(&[cause_str])
+            })),
+        }
+    }
+}
+
+pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
+    Lazy::new(WalRedoProcessCounters::default);
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
@@ -1388,28 +1445,23 @@ impl TimelineMetrics {
        }
    }

-    pub fn record_new_file_metrics(&self, sz: u64) {
+    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub fn resident_physical_size_sub(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub fn resident_physical_size_add(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub fn resident_physical_size_set(&self, sz: u64) {
-        self.resident_physical_size_gauge.set(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
-    }
-
-    pub fn resident_physical_size_get(&self) -> u64 {
+    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
@@ -1889,6 +1941,9 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

+    // Tenant manager stats
+    Lazy::force(&TENANT_MANAGER);
+
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
@@ -1904,7 +1959,6 @@ pub fn preinitialize_metrics() {
        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
    ]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -40,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, JwtAuth, Scope},
+    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -55,16 +55,20 @@ use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
-use crate::tenant::mgr::GetTenantError;
-use crate::tenant::{Tenant, Timeline};
+use crate::tenant::mgr::get_active_tenant_with_timeout;
+use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::Timeline;
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// is not yet in state [`TenantState::Active`].
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -118,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -186,7 +190,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -214,22 +218,34 @@ async fn page_service_conn_main(
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

-    // timeout should be lower, but trying out multiple days for
-    // <https://github.com/neondatabase/neon/issues/4205>
-    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
+    let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
+    let socket_timeout_ms = (|| {
+        fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
+            // Exponential distribution for simulating
+            // poor network conditions, expect about avg_timeout_ms to be around 15
+            // in tests
+            if let Some(avg_timeout_ms) = avg_timeout_ms {
+                let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
+                let u = rand::random::<f32>();
+                ((1.0 - u).ln() / (-avg)) as u64
+            } else {
+                default_timeout_ms
+            }
+        });
+        default_timeout_ms
+    })();
+
+    // A timeout here does not mean the client died, it can happen if it's just idle for
+    // a while: we will tear down this PageServerHandler and instantiate a new one if/when
+    // they reconnect.
+    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(
-        conf,
-        broker_client,
-        auth,
-        connection_ctx,
-        task_mgr::shutdown_token(),
-    );
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -255,7 +271,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -263,19 +279,14 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
-
-    /// A token that should fire when the tenant transitions from
-    /// attached state, or when the pageserver is shutting down.
-    cancel: CancellationToken,
 }

 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<JwtAuth>>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
-        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -283,7 +294,6 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
-            cancel,
        }
    }

@@ -291,7 +301,11 @@ impl PageServerHandler {
    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
    /// in the flush.
-    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
+    async fn flush_cancellable<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
@@ -299,7 +313,7 @@ impl PageServerHandler {
            flush_r = pgb.flush() => {
                Ok(flush_r?)
            },
-            _ = self.cancel.cancelled() => {
+            _ = cancel.cancelled() => {
                Err(QueryError::Shutdown)
            }
        )
@@ -308,6 +322,7 @@ impl PageServerHandler {
    fn copyin_stream<'a, IO>(
        &'a self,
        pgb: &'a mut PostgresBackend<IO>,
+        cancel: &'a CancellationToken,
    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -317,7 +332,7 @@ impl PageServerHandler {
                let msg = tokio::select! {
                    biased;

-                    _ = self.cancel.cancelled() => {
+                    _ = cancel.cancelled() => {
                        // We were requested to shut down.
                        let msg = "pageserver is shutting down";
                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -357,7 +372,7 @@ impl PageServerHandler {
                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
                        // error can't happen here, ErrorResponse serialization should be always ok
                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                    }
                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -384,12 +399,13 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // NOTE: pagerequests handler exits when connection is closed,
-        //       so there is no need to reset the association
-        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
-
        // Make request tracer if needed
-        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+        let tenant = mgr::get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path = tenant
@@ -405,9 +421,14 @@ impl PageServerHandler {
            .get_timeline(timeline_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;

+        // Avoid starting new requests if the timeline has already started shutting down,
+        // and block timeline shutdown until this request is complete, or drops out due
+        // to cancellation.
+        let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb).await?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -415,7 +436,7 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = self.cancel.cancelled() => {
+                _ = timeline.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    return Err(QueryError::Shutdown)
@@ -490,9 +511,24 @@ impl PageServerHandler {
                }
            };

+            if let Err(e) = &response {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_canceled(): [`Timeline::shutdown`]` has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
+                    // If we fail to fulfil a request during shutdown, which may be _because_ of
+                    // shutdown, then do not send the error to the client.  Instead just drop the
+                    // connection.
+                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
+                    return Err(QueryError::Shutdown);
+                }
+            }
+
            let response = response.unwrap_or_else(|e| {
                // print the all details to the log with {:#}, but for the client the
-                // error message is enough
+                // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                // here includes cancellation which is not an error.
                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
@@ -500,7 +536,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb).await?;
+            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -522,10 +558,14 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -543,9 +583,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
+        self.flush_cancellable(pgb, &tenant.cancel).await?;

-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -582,9 +622,10 @@ impl PageServerHandler {
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        debug_assert_current_span_has_tenant_and_timeline_id();
-        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
@@ -598,8 +639,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -792,7 +833,9 @@ impl PageServerHandler {
        let started = std::time::Instant::now();

        // check that the timeline exists
-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -807,7 +850,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        self.flush_cancellable(pgb).await?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -859,7 +902,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        self.flush_cancellable(pgb).await?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started
            .elapsed()
@@ -877,7 +920,7 @@ impl PageServerHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -889,7 +932,26 @@ impl PageServerHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id)
+        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
+    }
+
+    /// Shorthand for getting a reference to a Timeline of an Active tenant.
+    async fn get_active_tenant_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+        Ok(timeline)
    }
 }

@@ -909,16 +971,17 @@ where
            .auth
            .as_ref()
            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
+            .map_err(|e| QueryError::Unauthorized(e.0))?;

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "jwt token scope is Tenant, but tenant id is missing"
-            )));
+            return Err(QueryError::Unauthorized(
+                "jwt token scope is Tenant, but tenant id is missing".into(),
+            ));
        }

-        info!(
-            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
+        debug!(
+            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -940,9 +1003,13 @@ where
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
+        fail::fail_point!("simulated-bad-compute-connection", |_| {
+            info!("Hit failpoint for bad connection");
+            Err(QueryError::SimulatedConnectionError)
+        });
+
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1048,7 +1115,9 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+            let timeline = self
+                .get_active_tenant_timeline(tenant_id, timeline_id)
+                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();

@@ -1232,7 +1301,12 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+            let tenant = get_active_tenant_with_timeout(
+                tenant_id,
+                ACTIVE_TENANT_TIMEOUT,
+                &task_mgr::shutdown_token(),
+            )
+            .await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1278,67 +1352,16 @@ where
    }
 }

-#[derive(thiserror::Error, Debug)]
-enum GetActiveTenantError {
-    #[error(
-        "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
-    )]
-    WaitForActiveTimeout {
-        latest_state: TenantState,
-        wait_time: Duration,
-    },
-    #[error(transparent)]
-    NotFound(GetTenantError),
-    #[error(transparent)]
-    WaitTenantActive(tenant::WaitToBecomeActiveError),
-}
-
 impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
-            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
-        }
-    }
-}
-
-/// Get active tenant.
-///
-/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
-/// ensures that queries don't fail immediately after pageserver startup, because
-/// all tenants are still loading.
-async fn get_active_tenant_with_timeout(
-    tenant_id: TenantId,
-    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
-) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = match mgr::get_tenant(tenant_id, false).await {
-        Ok(tenant) => tenant,
-        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
-        Err(GetTenantError::NotActive(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
-        }
-        Err(GetTenantError::Broken(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
-        }
-    };
-    let wait_time = Duration::from_secs(30);
-    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
-        Ok(Ok(())) => Ok(tenant),
-        // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
-        Err(_) => {
-            let latest_state = tenant.current_state();
-            if latest_state == TenantState::Active {
-                Ok(tenant)
-            } else {
-                Err(GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state,
-                    wait_time,
-                })
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                QueryError::Shutdown
            }
+            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
 }
@@ -1359,18 +1382,3 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
-
-/// Shorthand for getting a reference to a Timeline of an Active tenant.
-async fn get_active_tenant_timeline(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
-    Ok(timeline)
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,8 +21,9 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
+use utils::bin_ser::DeserializeError;
+use utils::exp_counter::ExpCounter;
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
@@ -30,9 +31,33 @@ pub type BlockNumber = u32;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
+    /// Found commits both before and after the given timestamp
    Present(Lsn),
+
+    /// Found no commits after the given timestamp, this means
+    /// that the newest data in the branch is older than the given
+    /// timestamp.
+    ///
+    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),
+
+    /// The queried timestamp is past our horizon we look back at (PITR)
+    ///
+    /// All commits > LSN happened after the given timestamp,
+    /// but any commits < LSN might have happened before or after
+    /// the given timestamp. We don't know because no data before
+    /// the given lsn is available.
    Past(Lsn),
+
+    /// We have found no commit with a timestamp,
+    /// so we can't return anything meaningful.
+    ///
+    /// The associated LSN is the lower bound value we can safely
+    /// create branches on, but no statement is made if it is
+    /// older or newer than the timestamp.
+    ///
+    /// This variant can e.g. be returned right after a
+    /// cluster import.
    NoData(Lsn),
 }

@@ -44,6 +69,36 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum CollectKeySpaceError {
+    #[error(transparent)]
+    Decode(#[from] DeserializeError),
+    #[error(transparent)]
+    PageRead(PageReconstructError),
+    #[error("cancelled")]
+    Cancelled,
+}
+
+impl From<PageReconstructError> for CollectKeySpaceError {
+    fn from(err: PageReconstructError) -> Self {
+        match err {
+            PageReconstructError::Cancelled => Self::Cancelled,
+            err => Self::PageRead(err),
+        }
+    }
+}
+
+impl From<PageReconstructError> for CalculateLogicalSizeError {
+    fn from(pre: PageReconstructError) -> Self {
+        match pre {
+            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
+                Self::Cancelled
+            }
+            _ => Self::Other(pre.into()),
+        }
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
    #[error("Relation Already Exists")]
@@ -314,7 +369,11 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
-        let min_lsn = *gc_cutoff_lsn_guard;
+        // We use this method to figure out the branching LSN for the new branch, but the
+        // GC cutoff could be before the branching point and we cannot create a new branch
+        // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
+        // on the safe side.
+        let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
        let max_lsn = self.get_last_record_lsn();

        // LSNs are always 8-byte aligned. low/mid/high represent the
@@ -324,83 +383,92 @@ impl Timeline {

        let mut found_smaller = false;
        let mut found_larger = false;
+
+        // This is a search budget to not make the search too expensive
+        let mut budget = 1_000_000u32;
        while low < high {
            // cannot overflow, high and low are both smaller than u64::MAX / 2
-            let mid = (high + low) / 2;
+            // this always holds: low <= mid_start < high
+            let mid_start = (high + low) / 2;
+            let mut mid = mid_start;

-            let cmp = self
-                .is_latest_commit_timestamp_ge_than(
-                    search_timestamp,
-                    Lsn(mid * 8),
-                    &mut found_smaller,
-                    &mut found_larger,
-                    ctx,
-                )
-                .await?;
+            let mut max = None;

-            if cmp {
-                high = mid;
+            // Search for an lsn that has a commit timestamp.
+            // The search with `ExpCounter` will eventually try all LSNs
+            // in the range between mid_start and high, which is important
+            // when we set high to mid if we have exhausted all possibilities.
+            // We don't do an explosive search as we want to prove that
+            // every single lsn up to high is giving inconclusive results.
+            // This can only be done by trying all lsns.
+            for offs in ExpCounter::with_max_and_linear_search(high - mid_start) {
+                mid = offs + mid_start;
+
+                // Do the query for mid + 1 instead of mid so that we can make definite statements
+                // about low (low's invariant: always points to commit before or at search_timestamp).
+                max = self
+                    .get_max_timestamp_for_lsn(Lsn((mid + 1) * 8), ctx)
+                    .await?;
+                if max.is_some() {
+                    break;
+                }
+                // Do some limiting to make sure the query does not become too expensive.
+                if let Some(new_budget) = budget.checked_sub(1) {
+                    budget = new_budget;
+                } else {
+                    return Ok(LsnForTimestamp::NoData(min_lsn));
+                }
+            }
+
+            if let Some(max) = max {
+                if max <= search_timestamp {
+                    // We found a commit with timestamp before (or at) `search_timestamp`.
+                    found_smaller = true;
+                    low = mid + 1;
+                } else {
+                    // We found a commit with timestamp after `search_timestamp`.
+                    found_larger = true;
+                    high = mid;
+                }
            } else {
-                low = mid + 1;
+                // max is None so we have proof of a chain of None's from mid_start all up to high.
+                // Thus we can safely set high to mid_start
+                high = mid_start;
            }
        }
+        // If `found_smaller == true`, `low` is the LSN of the last commit record
+        // before or at `search_timestamp`
+        //
+        // FIXME: it would be better to get the LSN of the previous commit.
+        // Otherwise, if you restore to the returned LSN, the database will
+        // include physical changes from later commits that will be marked
+        // as aborted, and will need to be vacuumed away.
+        let commit_lsn = Lsn(low * 8);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
                // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
-            }
-            (true, false) => {
-                // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                Ok(LsnForTimestamp::NoData(min_lsn))
            }
            (false, true) => {
                // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                Ok(LsnForTimestamp::Past(min_lsn))
            }
-            (true, true) => {
-                // low is the LSN of the first commit record *after* the search_timestamp,
-                // Back off by one to get to the point just before the commit.
-                //
-                // FIXME: it would be better to get the LSN of the previous commit.
-                // Otherwise, if you restore to the returned LSN, the database will
-                // include physical changes from later commits that will be marked
-                // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+            (true, false) => {
+                // Only found commits with timestamps smaller than the request.
+                // It's still a valid case for branch creation, return it.
+                // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
+                // case, anyway.
+                Ok(LsnForTimestamp::Future(commit_lsn))
            }
+            (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
        }
    }

-    /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
-    /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
+    /// Obtain the timestamp for the given lsn.
    ///
-    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
-    /// with a smaller/larger timestamp.
-    ///
-    pub async fn is_latest_commit_timestamp_ge_than(
-        &self,
-        search_timestamp: TimestampTz,
-        probe_lsn: Lsn,
-        found_smaller: &mut bool,
-        found_larger: &mut bool,
-        ctx: &RequestContext,
-    ) -> Result<bool, PageReconstructError> {
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
-            if timestamp >= search_timestamp {
-                *found_larger = true;
-                return ControlFlow::Break(true);
-            } else {
-                *found_smaller = true;
-            }
-            ControlFlow::Continue(())
-        })
-        .await
-    }
-
-    /// Obtain the possible timestamp range for the given lsn.
-    ///
-    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub async fn get_timestamp_for_lsn(
+    /// If the lsn has no timestamps, returns None. Returns the maximum such timestamp otherwise.
+    pub(crate) async fn get_max_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -552,7 +620,8 @@ impl Timeline {
                Err(e) => Err(PageReconstructError::from(e)),
            },
            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
                Ok(HashMap::new())
            }
        }
@@ -566,30 +635,22 @@ impl Timeline {
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
-        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, lsn, ctx)
-                .await
-                .context("list rels")?
-            {
-                if cancel.is_cancelled() {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
-                let mut buf = self
-                    .get(relsize_key, lsn, ctx)
-                    .await
-                    .with_context(|| format!("read relation size of {rel:?}"))?;
+                let mut buf = self.get(relsize_key, lsn, ctx).await?;
                let relsize = buf.get_u32_le();

                total_size += relsize as u64;
@@ -602,11 +663,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    pub async fn collect_keyspace(
+    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<KeySpace> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -615,7 +676,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
+        let dbdir = DbDirectory::des(&buf)?;

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
@@ -648,7 +709,7 @@ impl Timeline {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
            let buf = self.get(slrudir_key, lsn, ctx).await?;
-            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
+            let dir = SlruSegmentDirectory::des(&buf)?;
            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
@@ -666,7 +727,7 @@ impl Timeline {
        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
@@ -675,8 +736,9 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        result.add_key(AUX_FILES_KEY);
-
+        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+            result.add_key(AUX_FILES_KEY);
+        }
        Ok(result.to_keyspace())
    }

@@ -1201,7 +1263,8 @@ impl<'a> DatadirModification<'a> {
        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
            Ok(buf) => AuxFilesDirectory::des(&buf)?,
            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
                AuxFilesDirectory {
                    files: HashMap::new(),
                }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -299,10 +299,6 @@ pub enum TaskKind {

 #[derive(Default)]
 struct MutableTaskState {
-    /// Tenant and timeline that this task is associated with.
-    tenant_id: Option<TenantId>,
-    timeline_id: Option<TimelineId>,
-
    /// Handle for waiting for the task to exit. It can be None, if the
    /// the task has already exited.
    join_handle: Option<JoinHandle<()>>,
@@ -319,6 +315,11 @@ struct PageServerTask {
    // To request task shutdown, just cancel this token.
    cancel: CancellationToken,

+    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
+    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
+    tenant_id: Option<TenantId>,
+    timeline_id: Option<TimelineId>,
+
    mutable: Mutex<MutableTaskState>,
 }

@@ -344,11 +345,9 @@ where
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
-        mutable: Mutex::new(MutableTaskState {
-            tenant_id,
-            timeline_id,
-            join_handle: None,
-        }),
+        tenant_id,
+        timeline_id,
+        mutable: Mutex::new(MutableTaskState { join_handle: None }),
    });

    TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -418,8 +417,6 @@ async fn task_finish(

    let mut shutdown_process = false;
    {
-        let task_mut = task.mutable.lock().unwrap();
-
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
@@ -428,13 +425,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                }
            }
@@ -442,13 +439,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
+                        task_name, task.tenant_id, task.timeline_id, err
                    );
                }
            }
@@ -460,17 +457,6 @@ async fn task_finish(
    }
 }

-// expected to be called from the task of the given id.
-pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
-    CURRENT_TASK.with(|ct| {
-        let mut task_mut = ct.mutable.lock().unwrap();
-        task_mut.tenant_id = tenant_id;
-        task_mut.timeline_id = timeline_id;
-    });
-}
-
-/// Is there a task running that matches the criteria
-
 /// Signal and wait for tasks to shut down.
 ///
 ///
@@ -493,17 +479,16 @@ pub async fn shutdown_tasks(
    {
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
-            let task_mut = task.mutable.lock().unwrap();
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
-                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
+                && (tenant_id.is_none() || task.tenant_id == tenant_id)
+                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
-                    task_mut.tenant_id,
-                    task_mut.timeline_id,
+                    task.tenant_id,
+                    task.timeline_id,
                ));
            }
        }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -26,6 +26,7 @@ use tracing::*;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
+use utils::sync::gate::Gate;

 use std::cmp::min;
 use std::collections::hash_map::Entry;
@@ -54,6 +55,8 @@ use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::GetActiveTenantError;
+use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -137,9 +140,7 @@ pub(crate) mod timeline;
 pub mod size;

 pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{
-    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
-};
+pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;
@@ -254,6 +255,20 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
+
+    // Cancellation token fires when we have entered shutdown().  This is a parent of
+    // Timelines' cancellation token.
+    pub(crate) cancel: CancellationToken,
+
+    // Users of the Tenant such as the page service must take this Gate to avoid
+    // trying to use a Tenant which is shutting down.
+    pub(crate) gate: Gate,
+}
+
+impl std::fmt::Debug for Tenant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.tenant_id, self.current_state())
+    }
 }

 pub(crate) enum WalRedoManager {
@@ -361,34 +376,6 @@ impl Debug for SetStoppingError {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub(crate) enum WaitToBecomeActiveError {
-    WillNotBecomeActive {
-        tenant_id: TenantId,
-        state: TenantState,
-    },
-    TenantDropped {
-        tenant_id: TenantId,
-    },
-}
-
-impl std::fmt::Display for WaitToBecomeActiveError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
-                write!(
-                    f,
-                    "Tenant {} will not become active. Current state: {:?}",
-                    tenant_id, state
-                )
-            }
-            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
-                write!(f, "Tenant {tenant_id} will not become active (dropped)")
-            }
-        }
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("a timeline with the given ID already exists")]
@@ -397,6 +384,8 @@ pub enum CreateTimelineError {
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
    AncestorNotActive,
+    #[error("tenant shutting down")]
+    ShuttingDown,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -528,7 +517,7 @@ impl Tenant {
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        init_order: Option<InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -668,14 +657,6 @@ impl Tenant {
                    }
                }

-                let can_start =
-                    init_order.as_ref().map(|x| &x.tenants_can_start);
-                if let Some(b) = can_start {
-                    info!("Waiting for can_start...");
-                    b.clone().wait().await;
-                    info!("Ready for can_start.");
-                }
-
                match tenant_clone.attach(init_order, preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
@@ -710,11 +691,18 @@ impl Tenant {
            cancel.clone(),
        )
        .await?;
-        info!("found {} timelines", remote_timeline_ids.len());

        let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME);
+        info!(
+            "found {} timelines, deleting={}",
+            remote_timeline_ids.len(),
+            deleting
+        );
+
        for k in other_keys {
-            info!("found non timeline key {k}");
+            if k != TENANT_DELETED_MARKER_FILE_NAME {
+                warn!("Unexpected non timeline key {k}");
+            }
        }

        Ok(TenantPreload {
@@ -744,6 +732,7 @@ impl Tenant {
            Some(p) => p,
            None => {
                // Deprecated dev mode: load from local disk state instead of remote storage
+                // https://github.com/neondatabase/neon/issues/5624
                return self.load_local(init_order, ctx).await;
            }
        };
@@ -755,7 +744,6 @@ impl Tenant {

        let mut timelines_to_resume_deletions = vec![];

-        // Wait for all the download tasks to complete & collect results.
        let mut remote_index_and_client = HashMap::new();
        let mut timeline_ancestors = HashMap::new();
        let mut existent_timelines = HashSet::new();
@@ -766,7 +754,7 @@ impl Tenant {

            let index_part = match preload.index_part {
                Ok(i) => {
-                    debug!("successfully downloaded index part for timeline {timeline_id}");
+                    debug!("remote index part exists for timeline {timeline_id}");
                    i
                }
                Err(e) => {
@@ -841,6 +829,8 @@ impl Tenant {
            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
        }

+        // The local filesystem contents are a cache of what's in the remote IndexPart;
+        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

        crate::failpoint_support::sleep_millis_async!("attach-before-activate");
@@ -1037,8 +1027,8 @@ impl Tenant {
                    TimelineId::try_from(timeline_uninit_mark_file.file_stem())
                        .with_context(|| {
                            format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
-                        )
+                                "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
+                            )
                        })?;
                let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
                if let Err(e) =
@@ -1525,6 +1515,11 @@ impl Tenant {
            )));
        }

+        let _gate = self
+            .gate
+            .enter()
+            .map_err(|_| CreateTimelineError::ShuttingDown)?;
+
        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");

@@ -1626,6 +1621,11 @@ impl Tenant {
        pitr: Duration,
        ctx: &RequestContext,
    ) -> anyhow::Result<GcResult> {
+        // Don't start doing work during shutdown
+        if let TenantState::Stopping { .. } = self.current_state() {
+            return Ok(GcResult::default());
+        }
+
        // there is a global allowed_error for this
        anyhow::ensure!(
            self.is_active(),
@@ -1654,6 +1654,12 @@ impl Tenant {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        // Don't start doing work during shutdown
+        if let TenantState::Stopping { .. } = self.current_state() {
+            return Ok(());
+        }
+
+        // We should only be called once the tenant has activated.
        anyhow::ensure!(
            self.is_active(),
            "Cannot run compaction iteration on inactive tenant"
@@ -1798,6 +1804,7 @@ impl Tenant {
        freeze_and_flush: bool,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();
+
        // Set tenant (and its timlines) to Stoppping state.
        //
        // Since we can only transition into Stopping state after activation is complete,
@@ -1823,6 +1830,7 @@ impl Tenant {
            }
            Err(SetStoppingError::AlreadyStopping(other)) => {
                // give caller the option to wait for this this shutdown
+                info!("Tenant::shutdown: AlreadyStopping");
                return Err(other);
            }
        };
@@ -1833,9 +1841,16 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let span = Span::current();
-                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
+                js.spawn(async move {
+                    if freeze_and_flush {
+                        timeline.flush_and_shutdown().instrument(span).await
+                    } else {
+                        timeline.shutdown().instrument(span).await
+                    }
+                });
            })
        };
+        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
            match res {
                Ok(()) => {}
@@ -1845,12 +1860,21 @@ impl Tenant {
            }
        }

+        // We cancel the Tenant's cancellation token _after_ the timelines have all shut down.  This permits
+        // them to continue to do work during their shutdown methods, e.g. flushing data.
+        tracing::debug!("Cancelling CancellationToken");
+        self.cancel.cancel();
+
        // shutdown all tenant and timeline tasks: gc, compaction, page service
        // No new tasks will be started for this tenant because it's in `Stopping` state.
        //
        // this will additionally shutdown and await all timeline tasks.
+        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;

+        // Wait for any in-flight operations to complete
+        self.gate.close().await;
+
        Ok(())
    }

@@ -2011,7 +2035,7 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
@@ -2019,11 +2043,9 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError| {
-                            WaitToBecomeActiveError::TenantDropped {
-                                tenant_id: self.tenant_id,
-                            }
-                        },
+                        |_e: tokio::sync::watch::error::RecvError|
+                            // Tenant existed but was dropped: report it as non-existent
+                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
                    )?;
                }
                TenantState::Active { .. } => {
@@ -2031,10 +2053,7 @@ impl Tenant {
                }
                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
-                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
-                        tenant_id: self.tenant_id,
-                        state: current_state,
-                    });
+                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
                }
            }
        }
@@ -2100,6 +2119,9 @@ where
 }

 impl Tenant {
+    pub fn get_tenant_id(&self) -> TenantId {
+        self.tenant_id
+    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -2257,6 +2279,7 @@ impl Tenant {
            initial_logical_size_can_start.cloned(),
            initial_logical_size_attempt.cloned().flatten(),
            state,
+            self.cancel.child_token(),
        );

        Ok(timeline)
@@ -2346,6 +2369,8 @@ impl Tenant {
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
+            cancel: CancellationToken::default(),
+            gate: Gate::new(format!("Tenant<{tenant_id}>")),
        }
    }

@@ -3468,17 +3493,21 @@ pub(crate) mod harness {

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();

+    pub(crate) fn setup_logging() {
+        LOG_HANDLE.get_or_init(|| {
+            logging::init(
+                logging::LogFormat::Test,
+                // enable it in case the tests exercise code paths that use
+                // debug_assert_current_span_has_tenant_and_timeline_id
+                logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+            )
+            .expect("Failed to init test logging")
+        });
+    }
+
    impl TenantHarness {
        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
-            LOG_HANDLE.get_or_init(|| {
-                logging::init(
-                    logging::LogFormat::Test,
-                    // enable it in case in case the tests exercise code paths that use
-                    // debug_assert_current_span_has_tenant_and_timeline_id
-                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
-                )
-                .expect("Failed to init test logging")
-            });
+            setup_logging();

            let repo_dir = PageServerConf::test_repo_dir(test_name);
            let _ = fs::remove_dir_all(&repo_dir);
@@ -3505,10 +3534,6 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -3678,7 +3703,7 @@ mod tests {
    use tokio_util::sync::CancellationToken;

    static TEST_KEY: Lazy<Key> =
-        Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
+        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
@@ -3774,9 +3799,9 @@ mod tests {
        let writer = tline.writer().await;

        #[allow(non_snake_case)]
-        let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
+        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
        #[allow(non_snake_case)]
-        let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
+        let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();

        // Insert a value on the timeline
        writer
@@ -4169,6 +4194,7 @@ mod tests {

    #[tokio::test]
    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        use storage_layer::AsLayerDesc;
        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -4176,16 +4202,18 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        let layer_map = tline.layers.read().await;
-        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
+        let level0_deltas = layer_map
+            .layer_map()
+            .get_level0_deltas()?
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
+            .collect::<Vec<_>>();

        assert!(!level0_deltas.is_empty());

        for delta in level0_deltas {
-            let delta = layer_map.get_from_desc(&delta);
            // Ensure we are dumping a delta layer here
-            let delta = delta.downcast_delta_layer().unwrap();
-
-            delta.dump(false, &ctx).await.unwrap();
+            assert!(delta.layer_desc().is_delta);
            delta.dump(true, &ctx).await.unwrap();
        }

@@ -4219,11 +4247,7 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness
-            .try_load_local(&ctx)
-            .await
-            .err()
-            .expect("should fail");
+        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
@@ -4357,7 +4381,7 @@ mod tests {

        let mut keyspace = KeySpaceAccum::new();

-        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
        for _ in 0..50 {
            for _ in 0..10000 {
@@ -4403,7 +4427,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4484,7 +4508,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4575,7 +4599,7 @@ mod tests {
        const NUM_KEYS: usize = 100;
        const NUM_TLINES: usize = 50;

-        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        // Track page mutation lsns across different timelines.
        let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];

@@ -4709,7 +4733,7 @@ mod tests {
            // Keeps uninit mark in place
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown(false)
+                .shutdown()
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -327,7 +327,7 @@ mod tests {
                let mut sz: u16 = rng.gen();
                // Make 50% of the arrays small
                if rng.gen() {
-                    sz |= 63;
+                    sz &= 63;
                }
                random_array(sz.into())
            })
--- a/Show More
+++ b/Show More