pageserver: remove unused Tenant::deletion_progress

deflake: test_live_reconfig_get_evictions_low_residence_... (#5926 )
- disable extra tenant - disable compaction which could try to repartition while we assert Split from #5108.
2026-02-16 09:00:38 +00:00 · 2023-11-27 16:49:56 +00:00 · 2023-11-27 15:20:54 +02:00 · 2023-11-27 12:50:19 +00:00 · 2023-11-27 12:10:23 +00:00 · 2023-11-27 09:31:20 +00:00
174 changed files with 6874 additions and 2588 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarily in a separate repo (neondatabase/autoscaling) and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -3,7 +3,7 @@
 **NB: this PR must be merged only by 'Create a merge commit'!**

 ### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
 - [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
 - [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?

--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -585,10 +586,13 @@ jobs:
        id: upload-coverage-report-new
        env:
          BUCKET: neon-github-public-dev
+          # A differential coverage report is available only for PRs.
+          # (i.e. for pushes into main/release branches we have a regular coverage report)
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
-          BASELINE="$(git merge-base HEAD origin/main)"
          CURRENT="${COMMIT_SHA}"
+          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"

          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info

@@ -723,6 +727,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -847,7 +852,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.5
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
@@ -869,8 +874,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
-            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -spec=vm-image-spec.yaml \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -112,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 5'
+    - cron: '0 6 * * 1'
  workflow_dispatch:

 jobs:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.

+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run the following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with the `--no-verify` option.
+
 ## Submitting changes

 1. Get at least one +1 on your PR before you push.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,8 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+arc-swap = "1.6"
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 azure_core = "0.16"
 azure_identity = "0.16"
 azure_storage = "0.16"
@@ -47,6 +48,7 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
+aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -65,7 +67,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -81,7 +83,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -134,6 +136,7 @@ strum_macros = "0.24"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
@@ -162,11 +165,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -203,7 +206,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ################# Binary contents sections

--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,7 +31,7 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-    });
+    let ext_remote_storage = matches
+        .get_one::<String>("remote-ext-config")
+        // Compatibility hack: if the control plane specified any remote-ext-config
+        // use the default value for extension storage proxy gateway.
+        // Remove this once the control plane is updated to pass the gateway URL
+        .map(|conf| {
+            if conf.starts_with("http") {
+                conf.trim_end_matches('/')
+            } else {
+                "http://pg-ext-s3-gateway"
+            }
+        });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -198,7 +206,7 @@ fn main() -> Result<()> {
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
+        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -479,13 +487,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -25,7 +25,7 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  the S3 bucket that we search for extensions in
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    /// the address of extension storage proxy gateway
+    pub ext_remote_storage: Option<String>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -698,6 +698,7 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;

        // 'Close' connection
@@ -710,8 +711,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -724,9 +729,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
@@ -738,6 +743,7 @@ impl ComputeNode {
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
+            handle_extension_neon(&mut client)?;
        }

        // 'Close' connection
@@ -951,12 +957,12 @@ LIMIT 100",
        real_ext_name: String,
        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
+        let ext_remote_storage =
+            self.ext_remote_storage
+                .as_ref()
+                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                    "Remote extensions storage is not configured",
+                )))?;

        let ext_archive_name = ext_path.object_name().expect("bad path");

@@ -1012,7 +1018,7 @@ LIMIT 100",
        let download_size = extension_server::download_extension(
            &real_ext_name,
            &ext_path,
-            remote_storage,
+            ext_remote_storage,
            &self.pgbin,
        )
        .await
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json
    }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
 use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -133,67 +131,36 @@ fn parse_pg_version(human_version: &str) -> &str {
    panic!("Unsuported postgres version {human_version}");
 }

-#[cfg(test)]
-mod tests {
-    use super::parse_pg_version;
-
-    #[test]
-    fn test_parse_pg_version() {
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            "v15"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            "v14"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_unsupported_version() {
-        parse_pg_version("PostgreSQL 13.14");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_incorrect_version_format() {
-        parse_pg_version("PostgreSQL 14");
-    }
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_storage: &GenericRemoteStorage,
+    ext_remote_storage: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut download_buffer)
-        .await?;
+
+    // TODO add retry logic
+    let download_buffer =
+        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+            Ok(buffer) => buffer,
+            Err(error_message) => {
+                return Err(anyhow::anyhow!(
+                    "error downloading extension {:?}: {:?}",
+                    ext_name,
+                    error_message
+                ));
+            }
+        };
+
    let download_size = download_buffer.len() as u64;
+    info!("Download size {:?}", download_size);
    // it's unclear whether it is more performant to decompress into memory or not
    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let decoder = Decoder::new(download_buffer.as_ref())?;
+    let mut archive = Archive::new(decoder);
+
    let unzip_dest = pgbin
        .strip_suffix("/bin/postgres")
        .expect("bad pgbin")
@@ -261,29 +228,69 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    }
 }

-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Do request to extension storage proxy, i.e.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HTTP GET
+// and return the response body as bytes
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
+    info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+    let resp = reqwest::get(uri).await?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.bytes().await {
+            Ok(resp) => {
+                info!("Download extension {:?} completed successfully", ext_path);
+                Ok(resp)
+            }
+            Err(e) => bail!("could not deserialize remote extension response: {}", e),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+        _ => bail!(
+            "unexpected remote extension response status code: {}",
+            resp.status()
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
+        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -674,3 +674,30 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>

    Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+    info!("handle extension neon");
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+    client.simple_query(query)?;
+
+    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+    info!("create neon extension with query: {}", query);
+    client.simple_query(query)?;
+
+    query = "ALTER EXTENSION neon SET SCHEMA neon";
+    info!("alter neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    // this will be a no-op if extension is already up to date,
+    // which may happen in two cases:
+    // - extension was just installed
+    // - extension was already installed and is up to date
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,6 +9,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
+    client: reqwest::blocking::Client,
 }

 const COMMAND: &str = "attachment_service";
@@ -24,6 +25,16 @@ pub struct AttachHookResponse {
    pub gen: Option<u32>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct InspectRequest {
+    pub tenant_id: TenantId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectResponse {
+    pub attachment: Option<(u32, NodeId)>,
+}
+
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
@@ -42,6 +53,9 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
+            client: reqwest::blocking::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
        }
    }

@@ -84,16 +98,13 @@ impl AttachmentService {
            .unwrap()
            .join("attach-hook")
            .unwrap();
-        let client = reqwest::blocking::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");

        let request = AttachHookRequest {
            tenant_id,
            node_id: Some(pageserver_id),
        };

-        let response = client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send()?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }
@@ -101,4 +112,26 @@ impl AttachmentService {
        let response = response.json::<AttachHookResponse>()?;
        Ok(response.gen)
    }
+
+    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>()?;
+        Ok(response.attachment)
+    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -32,7 +32,9 @@ use pageserver_api::control_api::{
    ValidateResponseTenant,
 };

-use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
+use control_plane::attachment_service::{
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
+};

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -255,12 +257,28 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    )
 }

+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let locked = state.write().await;
+    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
+
+    json_response(
+        StatusCode::OK,
+        InspectResponse {
+            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
+        },
+    )
+}
+
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/inspect", |r| request_span(r, handle_inspect))
 }

 #[tokio::main]
@@ -268,6 +286,7 @@ async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -11,13 +11,14 @@ use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use control_plane::pageserver::PageServerNode;
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
-    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
-    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use safekeeper_api::{
@@ -46,8 +47,8 @@ const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";

-fn default_conf() -> String {
-    format!(
+fn default_conf(num_pageservers: u16) -> String {
+    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
 control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
@@ -55,21 +56,33 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[[pageservers]]
-id = {DEFAULT_PAGESERVER_ID}
-listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
-listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}

 "#,
-        trust_auth = AuthType::Trust,
-    )
+    );
+
+    for i in 0..num_pageservers {
+        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+
+        template += &format!(
+            r#"
+[[pageservers]]
+id = {pageserver_id}
+listen_pg_addr = '127.0.0.1:{pg_port}'
+listen_http_addr = '127.0.0.1:{http_port}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
+"#,
+            trust_auth = AuthType::Trust,
+        )
+    }
+
+    template
 }

 ///
@@ -295,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
@@ -306,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        })?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(*num_pageservers)
    };

    let pg_version = init_match
@@ -320,6 +336,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
@@ -433,6 +452,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            migrate_tenant(env, tenant_id, new_pageserver)?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -459,8 +487,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+            )?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -867,20 +902,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
    }
 }

+fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+        NodeId(id_str.parse().context("while parsing pageserver id")?)
+    } else {
+        DEFAULT_PAGESERVER_ID
+    };
+
+    Ok(PageServerNode::from_env(
+        env,
+        env.get_pageserver_conf(node_id)?,
+    ))
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-            NodeId(id_str.parse().context("while parsing pageserver id")?)
-        } else {
-            DEFAULT_PAGESERVER_ID
-        };
-
-        Ok(PageServerNode::from_env(
-            env,
-            env.get_pageserver_conf(node_id)?,
-        ))
-    }
-
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            if let Err(e) = get_pageserver(env, subcommand_args)?
@@ -917,6 +952,20 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
            }
        }

+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status() {
                Ok(_) => println!("Page server is up and running"),
@@ -1203,7 +1252,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);

    let lsn_arg = Arg::new("lsn")
@@ -1224,6 +1273,13 @@ fn cli() -> Command {
        .help("Force initialization even if the repository is not empty")
        .required(false);

+    let num_pageservers_arg = Arg::new("num-pageservers")
+        .value_parser(value_parser!(u16))
+        .long("num-pageservers")
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1231,6 +1287,7 @@ fn cli() -> Command {
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
+                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
@@ -1258,6 +1315,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1301,6 +1359,10 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -45,6 +45,7 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -476,6 +477,18 @@ impl Endpoint {
            }
        }

+        // check for file remote_extensions_spec.json
+        // if it is present, read it and pass to compute_ctl
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
@@ -497,7 +510,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
@@ -15,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -15,7 +15,10 @@ use std::{io, result};

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
+};
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -31,6 +34,9 @@ use utils::{
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
+
 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
@@ -98,8 +104,10 @@ impl PageServerNode {
        }
    }

-    // pageserver conf overrides defined by neon_local configuration.
-    fn neon_local_overrides(&self) -> Vec<String> {
+    /// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
@@ -132,12 +140,25 @@ impl PageServerNode {
            ));
        }

+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
+            overrides.push(format!(
+                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
+            ));
+        }
+
        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
+
+        // Apply the user-provided overrides
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+
        overrides
    }

@@ -203,9 +224,6 @@ impl PageServerNode {
    }

    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
@@ -248,8 +266,7 @@ impl PageServerNode {
    ) -> Vec<Cow<'a, str>> {
        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];

-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        let overrides = self.neon_local_overrides(config_overrides);
        for config_override in overrides {
            args.push(Cow::Borrowed("-c"));
            args.push(Cow::Owned(config_override));
@@ -392,7 +409,7 @@ impl PageServerNode {
        };

        let request = models::TenantCreateRequest {
-            new_tenant_id,
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
            config,
        };
@@ -501,6 +518,27 @@ impl PageServerNode {
        Ok(())
    }

+    pub fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+    ) -> anyhow::Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+
+        self.http_request(
+            Method::PUT,
+            format!(
+                "{}/tenant/{}/location_config",
+                self.http_base_url, tenant_id
+            ),
+        )?
+        .json(&req_body)
+        .send()?
+        .error_from_body()?;
+
+        Ok(())
+    }
+
    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -0,0 +1,202 @@
+//!
+//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
+//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
+//! point to the new pageserver.
+//!
+use crate::local_env::LocalEnv;
+use crate::{
+    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
+    pageserver::PageServerNode,
+};
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use std::collections::HashMap;
+use std::time::Duration;
+use utils::{
+    generation::Generation,
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+/// Given an attached pageserver, retrieve the LSN for all timelines
+fn get_lsns(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+    let timelines = pageserver.timeline_list(&tenant_id)?;
+    Ok(timelines
+        .into_iter()
+        .map(|t| (t.timeline_id, t.last_record_lsn))
+        .collect())
+}
+
+/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
+/// `baseline`.
+fn await_lsn(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+    baseline: HashMap<TimelineId, Lsn>,
+) -> anyhow::Result<()> {
+    loop {
+        let latest = match get_lsns(tenant_id, pageserver) {
+            Ok(l) => l,
+            Err(e) => {
+                println!(
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    pageserver.conf.id
+                );
+                std::thread::sleep(Duration::from_millis(500));
+                continue;
+            }
+        };
+
+        let mut any_behind: bool = false;
+        for (timeline_id, baseline_lsn) in &baseline {
+            match latest.get(timeline_id) {
+                Some(latest_lsn) => {
+                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                    if latest_lsn < baseline_lsn {
+                        any_behind = true;
+                    }
+                }
+                None => {
+                    // Expected timeline isn't yet visible on migration destination.
+                    // (IRL we would have to account for timeline deletion, but this
+                    //  is just a test helper)
+                    any_behind = true;
+                }
+            }
+        }
+
+        if !any_behind {
+            println!("✅ LSN caught up.  Proceeding...");
+            break;
+        } else {
+            std::thread::sleep(Duration::from_millis(500));
+        }
+    }
+
+    Ok(())
+}
+
+/// This function spans multiple services, to demonstrate live migration of a tenant
+/// between pageservers:
+///  - Coordinate attach/secondary/detach on pageservers
+///  - call into attachment_service for generations
+///  - reconfigure compute endpoints to point to new attached pageserver
+pub fn migrate_tenant(
+    env: &LocalEnv,
+    tenant_id: TenantId,
+    dest_ps: PageServerNode,
+) -> anyhow::Result<()> {
+    // Look up the tenant's current attachment (if any); a new generation is requested below via attach_hook
+    let attachment_service = AttachmentService::from_env(env);
+
+    let previous = attachment_service.inspect(tenant_id)?;
+    let mut baseline_lsns = None;
+    if let Some((generation, origin_ps_id)) = &previous {
+        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
+
+        if origin_ps_id == &dest_ps.conf.id {
+            println!("🔁 Already attached to {origin_ps_id}, freshening...");
+            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let dest_conf = LocationConfig {
+                mode: LocationConfigMode::AttachedSingle,
+                generation: gen.map(Generation::new),
+                secondary_conf: None,
+                tenant_conf: TenantConfig::default(),
+            };
+            dest_ps.location_config(tenant_id, dest_conf)?;
+            println!("✅ Migration complete");
+            return Ok(());
+        }
+
+        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
+
+        let stale_conf = LocationConfig {
+            mode: LocationConfigMode::AttachedStale,
+            generation: Some(Generation::new(*generation)),
+            secondary_conf: None,
+            tenant_conf: TenantConfig::default(),
+        };
+        origin_ps.location_config(tenant_id, stale_conf)?;
+
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+    }
+
+    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedMulti,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+
+    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    if let Some(baseline) = baseline_lsns {
+        println!("🕑 Waiting for LSN to catch up...");
+        await_lsn(tenant_id, &dest_ps, baseline)?;
+    }
+
+    let cplane = ComputeControlPlane::load(env.clone())?;
+    for (endpoint_name, endpoint) in &cplane.endpoints {
+        if endpoint.tenant_id == tenant_id {
+            println!(
+                "🔁 Reconfiguring endpoint {} to use pageserver {}",
+                endpoint_name, dest_ps.conf.id
+            );
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+        }
+    }
+
+    for other_ps_conf in &env.pageservers {
+        if other_ps_conf.id == dest_ps.conf.id {
+            continue;
+        }
+
+        let other_ps = PageServerNode::from_env(env, other_ps_conf);
+        let other_ps_tenants = other_ps.tenant_list()?;
+
+        // Check if this tenant is attached
+        let found = other_ps_tenants
+            .into_iter()
+            .map(|t| t.id)
+            .any(|i| i == tenant_id);
+        if !found {
+            continue;
+        }
+
+        // Downgrade to a secondary location
+        let secondary_conf = LocationConfig {
+            mode: LocationConfigMode::Secondary,
+            generation: None,
+            secondary_conf: Some(LocationConfigSecondary { warm: true }),
+            tenant_conf: TenantConfig::default(),
+        };
+
+        println!(
+            "💤 Switching to secondary mode on pageserver {}",
+            other_ps.conf.id
+        );
+        other_ps.location_config(tenant_id, secondary_conf)?;
+    }
+
+    println!(
+        "🔁 Switching to AttachedSingle mode on pageserver {}",
+        dest_ps.conf.id
+    );
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedSingle,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    println!("✅ Migration complete");
+
+    Ok(())
+}
--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre

 The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.

-The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>

 The happy path sequence:

--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,5 +17,9 @@ postgres_ffi.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+hex.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+bincode.workspace = true
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -0,0 +1,142 @@
+use anyhow::{bail, Result};
+use byteorder::{ByteOrder, BE};
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+///
+/// The derived `Ord` compares `field1` through `field6` in declaration order, which is the
+/// same order in which [`Key::write_to_byte_slice`] lays the fields out (big-endian,
+/// 1 + 4 + 4 + 4 + 1 + 4 = 18 bytes) and in which the `Display` impl prints them
+/// (36 hex digits).
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+pub struct Key {
+    pub field1: u8,
+    // Tablespace id for relations, small enum numbers otherwise (see `Key::to_i128`).
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+/// Size in bytes of the fixed-width encoding of a [`Key`] (`u8` + 3 x `u32` + `u8` + `u32`).
+/// This matches the 18 bytes read by [`Key::from_slice`] and written by
+/// [`Key::write_to_byte_slice`].
+pub const KEY_SIZE: usize = 18;
+
+impl Key {
+    /// Packs the key into a single `i128`, keeping 4 bits of `field1` and 16 bits of `field2`.
+    ///
+    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
+    /// As long as Neon does not support tablespace (because of lack of access to local file system),
+    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
+    ///
+    /// # Panics
+    ///
+    /// Panics if `field2` is neither below `0xFFFF` nor one of the two special values
+    /// admitted by the assertion.
+    pub fn to_i128(&self) -> i128 {
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        let part1 = ((self.field1 & 0xf) as i128) << 120;
+        let part2 = ((self.field2 & 0xFFFF) as i128) << 104;
+        let part3 = (self.field3 as i128) << 72;
+        let part4 = (self.field4 as i128) << 40;
+        let part5 = (self.field5 as i128) << 32;
+        let part6 = self.field6 as i128;
+        part1 | part2 | part3 | part4 | part5 | part6
+    }
+
+    /// Unpacks a key from the bit layout produced by [`Key::to_i128`].
+    ///
+    /// Only 4 bits are read back into `field1` and 16 bits into `field2`, so wider values
+    /// of those two fields do not survive a round trip.
+    pub const fn from_i128(x: i128) -> Self {
+        let field1 = ((x >> 120) & 0xf) as u8;
+        let field2 = ((x >> 104) & 0xFFFF) as u32;
+        let field3 = (x >> 72) as u32;
+        let field4 = (x >> 40) as u32;
+        let field5 = (x >> 32) as u8;
+        let field6 = x as u32;
+        Key {
+            field1,
+            field2,
+            field3,
+            field4,
+            field5,
+            field6,
+        }
+    }
+
+    /// Returns the key that immediately follows this one.
+    pub fn next(&self) -> Key {
+        self.add(1)
+    }
+
+    /// Returns this key advanced by `x`.
+    ///
+    /// `x` is added to `field6`; each overflow carries one into the next more significant
+    /// field (`field5`, `field4`, ... `field1`).
+    ///
+    /// # Panics
+    ///
+    /// Panics if the carry propagates out of `field1`.
+    pub fn add(&self, x: u32) -> Key {
+        let mut sum = *self;
+
+        let (low, carry) = sum.field6.overflowing_add(x);
+        sum.field6 = low;
+        if !carry {
+            return sum;
+        }
+
+        let (value, carry) = sum.field5.overflowing_add(1);
+        sum.field5 = value;
+        if !carry {
+            return sum;
+        }
+
+        let (value, carry) = sum.field4.overflowing_add(1);
+        sum.field4 = value;
+        if !carry {
+            return sum;
+        }
+
+        let (value, carry) = sum.field3.overflowing_add(1);
+        sum.field3 = value;
+        if !carry {
+            return sum;
+        }
+
+        let (value, carry) = sum.field2.overflowing_add(1);
+        sum.field2 = value;
+        if !carry {
+            return sum;
+        }
+
+        let (value, carry) = sum.field1.overflowing_add(1);
+        sum.field1 = value;
+        assert!(!carry);
+        sum
+    }
+
+    /// Decodes a key from the first 18 bytes of `b` (multi-byte fields are big-endian).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `b` is shorter than 18 bytes.
+    pub fn from_slice(b: &[u8]) -> Self {
+        let field1 = b[0];
+        let field2 = BE::read_u32(&b[1..5]);
+        let field3 = BE::read_u32(&b[5..9]);
+        let field4 = BE::read_u32(&b[9..13]);
+        let field5 = b[13];
+        let field6 = BE::read_u32(&b[14..18]);
+        Key {
+            field1,
+            field2,
+            field3,
+            field4,
+            field5,
+            field6,
+        }
+    }
+
+    /// Encodes the key into the first 18 bytes of `buf`, mirroring [`Key::from_slice`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if `buf` is shorter than 18 bytes.
+    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
+        buf[0] = self.field1;
+        buf[1..5].copy_from_slice(&self.field2.to_be_bytes());
+        buf[5..9].copy_from_slice(&self.field3.to_be_bytes());
+        buf[9..13].copy_from_slice(&self.field4.to_be_bytes());
+        buf[13] = self.field5;
+        buf[14..18].copy_from_slice(&self.field6.to_be_bytes());
+    }
+}
+
+impl fmt::Display for Key {
+    /// Writes the six fields as zero-padded, fixed-width, uppercase hex: 36 digits in total,
+    /// in declaration order.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Key {
+            field1,
+            field2,
+            field3,
+            field4,
+            field5,
+            field6,
+        } = self;
+        write!(
+            f,
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            field1, field2, field3, field4, field5, field6
+        )
+    }
+}
+
+impl Key {
+    /// The smallest key: every field at its minimum value.
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    /// The largest key: every field at its maximum value.
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    /// Parses the 36-digit hex form produced by the `Display` impl.
+    ///
+    /// Both upper- and lower-case hex digits are accepted.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `s` is not exactly 36 ASCII hex digits.
+    pub fn from_hex(s: &str) -> Result<Self> {
+        // Checking every byte up front (rather than only the byte length) does two things:
+        // it guarantees the fixed-offset slices below land on char boundaries, so arbitrary
+        // UTF-8 input cannot cause a slicing panic, and it rejects the leading '+' that
+        // `from_str_radix` would otherwise tolerate inside a field.
+        if s.len() != 36 || !s.bytes().all(|b| b.is_ascii_hexdigit()) {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,9 +1,13 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
 pub mod control_api;
+pub mod key;
 pub mod models;
 pub mod reltag;
+pub mod shard;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -16,7 +16,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

@@ -187,7 +187,7 @@ pub struct TimelineCreateRequest {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    pub new_tenant_id: TenantId,
+    pub new_tenant_id: TenantShardId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generation: Option<u32>,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -0,0 +1,321 @@
+use std::{ops::RangeInclusive, str::FromStr};
+
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+use utils::id::TenantId;
+
+/// The `shard_number` component of a [`TenantShardId`]: which shard of the tenant this is.
+/// In the example on [`TenantShardId`], `01` denotes the second shard, i.e. numbering
+/// starts at zero.
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardNumber(pub u8);
+
+/// The `shard_count` component of a [`TenantShardId`]. A count of zero is the legacy
+/// marker for an unsharded tenant, as described on [`TenantShardId`].
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardCount(pub u8);
+
+impl ShardCount {
+    /// Largest representable shard count; used as the upper bound in
+    /// `TenantShardId::tenant_range`.
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl ShardNumber {
+    /// Largest representable shard number; used as the upper bound in
+    /// `TenantShardId::tenant_range`.
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+/// TenantShardId identifies the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard count>`, for example:
+///
+/// ```text
+/// # The second shard in a two-shard tenant
+/// 072f1291a5310026820b2fe4b2968934-0102
+/// ```
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+///
+/// The derived ordering compares `tenant_id` first, then `shard_number`, then
+/// `shard_count`, so all shards of one tenant sort next to each other.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl TenantShardId {
+    /// Builds the legacy, unsharded id for `tenant_id`: shard number and shard count are
+    /// both zero.
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        let shard_number = ShardNumber(0);
+        let shard_count = ShardCount(0);
+        Self {
+            tenant_id,
+            shard_number,
+            shard_count,
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        // Lowest possible id for this tenant: number 0, count 0.
+        let first = Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        };
+        // Highest possible id for this tenant: both components at their maximum.
+        let last = Self {
+            tenant_id,
+            shard_number: ShardNumber::MAX,
+            shard_count: ShardCount::MAX,
+        };
+        first..=last
+    }
+
+    /// Returns the four-hex-digit suffix `<shard number><shard count>`, without the tenant
+    /// id and without the `-` separator.
+    pub fn shard_slug(&self) -> String {
+        let ShardNumber(number) = self.shard_number;
+        let ShardCount(count) = self.shard_count;
+        format!("{:02x}{:02x}", number, count)
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    /// Writes `<tenant_id>-<shard number><shard count>`, or just the tenant id when
+    /// `shard_count` is zero.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count == ShardCount(0) {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            return self.tenant_id.fmt(f);
+        }
+
+        write!(
+            f,
+            "{}-{:02x}{:02x}",
+            self.tenant_id, self.shard_number.0, self.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    /// Debug output is the `Display` output: the compact hex representation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{}", self))
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    /// Parses either a bare 32-character tenant id (legacy, unsharded) or the 37-character
+    /// form `<tenant_id>-<shard number><shard count>`.
+    ///
+    /// # Errors
+    ///
+    /// * `InvalidStringLength` if `s` is neither 32 nor 37 bytes long.
+    /// * `InvalidHexCharacter` (index 32) if the separator of the 37-byte form is not `-`.
+    /// * Whatever the hex decoding of the tenant id or the shard suffix reports.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            // The byte between the tenant id and the shard suffix must be the separator;
+            // without this check any character (or a 37th hex digit) would be accepted.
+            // The offending byte is reported as-is, which is only approximate when it is
+            // part of a multi-byte character.
+            let separator = bytes[32];
+            if separator != b'-' {
+                return Err(hex::FromHexError::InvalidHexCharacter {
+                    c: separator as char,
+                    index: 32,
+                });
+            }
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    /// Unpacks the 18-byte binary form: bytes 0..16 are the tenant id, byte 16 is the shard
+    /// number and byte 17 is the shard count.
+    fn from(b: [u8; 18]) -> Self {
+        let mut tenant_id_bytes = [0u8; 16];
+        tenant_id_bytes.copy_from_slice(&b[..16]);
+        let [.., shard_number, shard_count] = b;
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(shard_number),
+            shard_count: ShardCount(shard_count),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    /// Human-readable serializers get the `Display` string; all others get an 18-byte
+    /// array (16 bytes of tenant id, then shard number, then shard count).
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            return serializer.collect_str(self);
+        }
+
+        let mut packed = [0u8; 18];
+        let (id_part, shard_part) = packed.split_at_mut(16);
+        id_part.copy_from_slice(&self.tenant_id.as_arr());
+        shard_part[0] = self.shard_number.0;
+        shard_part[1] = self.shard_count.0;
+
+        packed.serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    /// Mirror image of the `Serialize` impl: a string for human-readable formats, an
+    /// 18-element tuple otherwise.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        /// Visitor that accepts either encoding; the flag only selects the `expecting` text.
+        struct IdVisitor {
+            human_readable: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                let expected = if self.human_readable {
+                    "value in form of hex string"
+                } else {
+                    "value in form of integer array([u8; 18])"
+                };
+                formatter.write_str(expected)
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let source = serde::de::value::SeqAccessDeserializer::new(seq);
+                <[u8; 18]>::deserialize(source).map(TenantShardId::from)
+            }
+        }
+
+        let human_readable = deserializer.is_human_readable();
+        let visitor = IdVisitor { human_readable };
+        if human_readable {
+            deserializer.deserialize_str(visitor)
+        } else {
+            deserializer.deserialize_tuple(18, visitor)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Round-trip tests for the human-readable and binary encodings of `TenantShardId`.
+    use std::str::FromStr;
+
+    // `bincode` is reached through the extern prelude; a bare `use bincode;` is redundant.
+    use utils::{id::TenantId, Hex};
+
+    use super::*;
+
+    const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
+
+    /// A sharded id formats as `<tenant>-<number><count>` and parses back to itself.
+    #[test]
+    fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = format!("{example}");
+
+        let expected = format!("{EXAMPLE_TENANT_ID}-070a");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    /// The binary encoding is the 16 tenant id bytes followed by shard number and count.
+    #[test]
+    fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = bincode::serialize(&example).unwrap();
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x07, 0x0a,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize(&encoded).unwrap();
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
+        // Test that TenantShardId can decode a TenantId in human
+        // readable form
+        let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded.tenant_id);
+        assert_eq!(decoded.shard_count, ShardCount(0));
+        assert_eq!(decoded.shard_number, ShardNumber(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
+        // Test that a legacy TenantShardId encodes into a form that
+        // can be decoded as TenantId
+        let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let example = TenantShardId::unsharded(example_tenant_id);
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantId::from_str(&encoded)?;
+
+        assert_eq!(example_tenant_id, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
+        // Unlike in human readable encoding, binary encoding does not
+        // do any special handling of legacy unsharded TenantIds: this test
+        // is equivalent to the main test for binary encoding, just verifying
+        // that the same behavior applies when we have used `unsharded()` to
+        // construct a TenantShardId.
+        let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
+        let encoded = bincode::serialize(&example).unwrap();
+
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x00, 0x00,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_invalid_length() {
+        // Anything that is neither the 32-character legacy form nor the 37-character
+        // sharded form is rejected before any hex decoding is attempted.
+        let too_short = format!("{EXAMPLE_TENANT_ID}-07");
+        let too_long = format!("{EXAMPLE_TENANT_ID}-070a0");
+
+        for input in ["", too_short.as_str(), too_long.as_str()] {
+            assert_eq!(
+                TenantShardId::from_str(input),
+                Err(hex::FromHexError::InvalidStringLength)
+            );
+        }
+    }
+}
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -730,6 +738,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                if let Err(e) = handler.process_query(self, query_string).await {
                    match e {
                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
                        e => {
                            log_query_error(query_string, &e);
                            let short_error = short_error(&e);
@@ -964,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -980,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
+aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,19 +6,15 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +30,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -540,18 +526,6 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -633,11 +607,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,23 +4,27 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
+use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -83,10 +87,23 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .credentials_provider(credentials_provider)
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
+            .retry_config(retry_config.build());

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -281,6 +281,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -469,8 +470,6 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -210,6 +210,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -396,8 +397,6 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/scripts/restore_from_wal_initdb.sh
+++ b/libs/utils/scripts/restore_from_wal_initdb.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# like restore_from_wal.sh, but takes existing initdb.tar.zst
+
+set -euxo pipefail
+
+PG_BIN=$1
+WAL_PATH=$2
+DATA_DIR=$3
+PORT=$4
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
+declare -i WAL_SIZE=$REDO_POS+114
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f 000000010000000000000001
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -10,7 +11,7 @@ use jsonwebtoken::{
 };
 use serde::{Deserialize, Serialize};

-use crate::id::TenantId;
+use crate::{http::error::ApiError, id::TenantId};

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -44,31 +45,106 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth));
+    }
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        self.0.load().decode(token)
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load())
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct AuthError(pub Cow<'static, str>);
+
+impl Display for AuthError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debuggability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+    /// Attempt to decode the token with the internal decoding keys.
+    ///
+    /// The function tries the stored decoding keys in succession,
+    /// and returns the first yielding a successful result.
+    /// If there is no working decoding key, it returns the last error.
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        let mut res = None;
+        for decoding_key in &self.decoding_keys {
+            res = Some(decode(token, decoding_key, &self.validation));
+            if let Some(Ok(res)) = res {
+                return Ok(res);
+            }
+        }
+        if let Some(res) = res {
+            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
+        } else {
+            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
+        }
    }
 }

@@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() -> Result<(), anyhow::Error> {
+    fn test_decode() {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

@@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
-
-        Ok(())
    }

    #[test]
-    fn test_encode() -> Result<(), anyhow::Error> {
+    fn test_encode() {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let decoded = auth.decode(&encoded)?;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let decoded = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
-
-        Ok(())
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth
-                        .decode(token)
-                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
+                    let data = auth.decode(token).map_err(|err| {
+                        warn!("Authentication error: {err}");
+                        // Rely on From<AuthError> for ApiError impl
+                        err
+                    })?;
                    req.set_context(data.claims);
                }
                None => {
@@ -450,12 +452,11 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
+    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => {
-            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
-        }
+        Some(claims) => Ok(check_permission(&claims)
+            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::{error, info, warn};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
+        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
+            warn!("Error processing HTTP request: {api_error:#}")
+        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -66,9 +66,17 @@ pub enum TracingErrorLayerEnablement {
    EnableWithRustLogFilter,
 }

+/// Where the logging should output to.
+#[derive(Clone, Copy)]
+pub enum Output {
+    Stdout,
+    Stderr,
+}
+
 pub fn init(
    log_format: LogFormat,
    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+    output: Output,
 ) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
@@ -85,7 +93,12 @@ pub fn init(
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
            .with_ansi(false)
-            .with_writer(std::io::stdout);
+            .with_writer(move || -> Box<dyn std::io::Write> {
+                match output {
+                    Output::Stdout => Box::new(std::io::stdout()),
+                    Output::Stderr => Box::new(std::io::stderr()),
+                }
+            });
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
            LogFormat::Plain => log_layer.boxed(),
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,6 +125,9 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

+            // Block any future waiters from starting
+            internal.shutdown = true;
+
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: `_exit` has no memory-safety preconditions; it is `unsafe` only because it is an FFI call
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -85,6 +85,13 @@ impl Gate {
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }

+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analogous to "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
    }
 }

+#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
+#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -18,3 +18,5 @@ tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
+serde_json.workspace = true
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -0,0 +1,38 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;
+
+#[derive(clap::Subcommand)]
+pub(crate) enum IndexPartCmd {
+    Dump { path: Utf8PathBuf },
+}
+
+pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
+    match cmd {
+        IndexPartCmd::Dump { path } => {
+            let bytes = tokio::fs::read(path).await.context("read file")?;
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            println!("{output}");
+            Ok(())
+        }
+    }
+}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -5,11 +5,13 @@
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.

 mod draw_timeline_dir;
+mod index_part;
 mod layer_map_analyzer;
 mod layers;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
    context::{DownloadBehavior, RequestContext},
@@ -38,6 +40,8 @@ struct CliOpts {
 #[derive(Subcommand)]
 enum Commands {
    Metadata(MetadataCmd),
+    #[command(subcommand)]
+    IndexPart(IndexPartCmd),
    PrintLayerFile(PrintLayerFileCmd),
    DrawTimeline {},
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -83,6 +87,9 @@ async fn main() -> anyhow::Result<()> {
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
        }
+        Commands::IndexPart(cmd) => {
+            index_part::main(&cmd).await?;
+        }
        Commands::DrawTimeline {} => {
            draw_timeline_dir::main()?;
        }
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,22 +1,21 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => {
-            bail!("SafekeeperData scope makes no sense for Pageserver")
-        }
+        (Scope::SafekeeperData, _) => Err(AuthError(
+            "SafekeeperData scope makes no sense for Pageserver".into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -100,7 +103,11 @@ fn main() -> anyhow::Result<()> {
    } else {
        TracingErrorLayerEnablement::Disabled
    };
-    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+    logging::init(
+        conf.log_format,
+        tracing_error_layer_enablement,
+        logging::Output::Stdout,
+    )?;

    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
@@ -321,13 +328,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -410,7 +416,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -420,6 +426,7 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -548,6 +555,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
@@ -617,6 +625,7 @@ fn start_pageserver(
                    conf.synthetic_size_calculation_interval,
                    conf.id,
                    local_disk_storage,
+                    cancel,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs: NonZeroUsize::new(
-                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
-                    )
-                        .unwrap(),
-                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
-                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs,
-                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -12,6 +12,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::NodeId;

@@ -37,6 +38,7 @@ type RawMetric = (MetricsKey, (EventType, u64));
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
+#[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
@@ -44,6 +46,7 @@ pub async fn collect_metrics(
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
+    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    if _cached_metric_collection_interval != Duration::ZERO {
@@ -63,9 +66,13 @@ pub async fn collect_metrics(
        "synthetic size calculation",
        false,
        async move {
-            calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
+            calculate_synthetic_size_worker(
+                synthetic_size_calculation_interval,
+                &cancel,
+                &worker_ctx,
+            )
+            .instrument(info_span!("synthetic_size_worker"))
+            .await?;
            Ok(())
        },
    );
@@ -241,6 +248,7 @@ async fn reschedule(
 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
+    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
@@ -272,7 +280,12 @@ async fn calculate_synthetic_size_worker(
                // Same for the loop that fetches computed metrics.
                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
                // which turns out is really handy to understand the system.
-                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
+                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
+                    if let Some(PageReconstructError::Cancelled) =
+                        e.downcast_ref::<PageReconstructError>()
+                    {
+                        return Ok(());
+                    }
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
            }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -513,6 +513,7 @@ impl DeletionQueueClient {
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");
+
            let mut layer_paths = Vec::new();
            for (layer, generation) in layers {
                layer_paths.push(remote_layer_path(
@@ -893,14 +894,6 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,6 +52,31 @@ paths:
              schema:
                type: object

+  /v1/reload_auth_validation_keys:
+    post:
+      description: Reloads the JWT public keys from their pre-configured location on disk.
+      responses:
+        "200":
+          description: The reload completed successfully.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error (also returned if no keys were found)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -327,7 +352,8 @@ paths:
          in: query
          required: true
          schema:
-            type: integer
+            type: string
+            format: hex
          description: A LSN to get the timestamp
      responses:
        "200":
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -6,6 +6,7 @@ use std::str::FromStr;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
+use enumset::EnumSet;
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
 use hyper::header;
@@ -16,10 +17,12 @@ use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
+use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -35,17 +38,18 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
+use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -63,7 +67,8 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    auth: Option<Arc<JwtAuth>>,
+    tenant_manager: Arc<TenantManager>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -74,7 +79,8 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        auth: Option<Arc<JwtAuth>>,
+        tenant_manager: Arc<TenantManager>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -86,6 +92,7 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
+            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -273,7 +280,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
        use crate::tenant::delete::DeleteTenantError::*;
        match value {
            Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
            SlotError(e) => e.into(),
@@ -299,11 +305,7 @@ async fn build_timeline_info(
        // we're executing this function, we will outlive the timeline on-disk state.
        info.current_logical_size_non_incremental = Some(
            timeline
-                .get_current_logical_size_non_incremental(
-                    info.last_record_lsn,
-                    CancellationToken::new(),
-                    ctx,
-                )
+                .get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
                .await?,
        );
    }
@@ -389,13 +391,39 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

+/// Re-reads the JWT public key(s) from disk and swaps them into the shared auth state.
+async fn reload_auth_validation_keys_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let config = get_config(&request);
+    let state = get_state(&request);
+    let Some(shared_auth) = &state.auth else {
+        return json_response(StatusCode::BAD_REQUEST, ()); // no auth state to swap keys into
+    };
+    // unwrap is ok because check is performed when creating config, so path is set and exists
+    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
+    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
+    match JwtAuth::from_key_path(key_path) {
+        Ok(new_auth) => {
+            shared_auth.swap(new_auth);
+            json_response(StatusCode::OK, ())
+        }
+        Err(e) => {
+            warn!("Error reloading public keys from {key_path:?}: {e:#}"); // `#` keeps the cause chain
+            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+        }
+    }
+}
+
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let new_timeline_id = request_data.new_timeline_id;

@@ -404,7 +432,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -438,7 +466,10 @@ async fn timeline_create_handler(
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create",
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard = %tenant_shard_id.shard_slug(),
+        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

@@ -518,7 +549,7 @@ async fn timeline_detail_handler(

 async fn get_lsn_by_timestamp_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -534,7 +565,9 @@ async fn get_lsn_by_timestamp_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
+    let result = timeline
+        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
+        .await?;

    if version.unwrap_or(0) > 1 {
        #[derive(serde::Serialize)]
@@ -634,14 +667,15 @@ async fn timeline_delete_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);

-    mgr::delete_timeline(tenant_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
+    state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
+        .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -655,11 +689,14 @@ async fn tenant_detach_handler(
    check_permission(&request, Some(tenant_id))?;
    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;

+    // This is a legacy API (`/location_conf` is the replacement).  It only supports unsharded tenants
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let state = get_state(&request);
    let conf = state.conf;
    mgr::detach_tenant(
        conf,
-        tenant_id,
+        tenant_shard_id,
        detach_ignored.unwrap_or(false),
        &state.deletion_queue_client,
    )
@@ -776,13 +813,16 @@ async fn tenant_delete_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    // TODO openapi spec
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);

-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
-        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
+        .instrument(info_span!("tenant_delete_handler",
+            tenant_id = %tenant_shard_id.tenant_id,
+            shard = tenant_shard_id.shard_slug()
+        ))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -803,7 +843,7 @@ async fn tenant_delete_handler(
 /// without modifying anything anyway.
 async fn tenant_size_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -819,6 +859,7 @@ async fn tenant_size_handler(
        .gather_size_inputs(
            retention_period,
            LogicalSizeCalculationCause::TenantSizeHandler,
+            &cancel,
            &ctx,
        )
        .await
@@ -1112,9 +1153,10 @@ async fn put_tenant_location_config_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+
    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
-    let tenant_id = request_data.tenant_id;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let state = get_state(&request);
@@ -1123,9 +1165,13 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
-            .instrument(info_span!("tenant_detach", %tenant_id))
-            .await
+        if let Err(e) =
+            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+                .instrument(info_span!("tenant_detach",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard = tenant_shard_id.shard_slug()
+                ))
+                .await
        {
            match e {
                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1140,20 +1186,14 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
+    state
+        .tenant_manager
+        .upsert_location(tenant_shard_id, location_conf, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1204,7 +1244,7 @@ async fn failpoints_handler(
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1213,7 +1253,7 @@ async fn timeline_gc_handler(
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -1232,11 +1272,15 @@ async fn timeline_compact_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let mut flags = EnumSet::empty();
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
+        flags |= CompactFlags::ForceRepartition;
+    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
        timeline
-            .compact(&cancel, &ctx)
+            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
@@ -1253,6 +1297,11 @@ async fn timeline_checkpoint_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
+
+    let mut flags = EnumSet::empty();
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
+        flags |= CompactFlags::ForceRepartition;
+    }
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
@@ -1261,7 +1310,7 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&cancel, &ctx)
+            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

@@ -1458,7 +1507,7 @@ async fn timeline_collect_keyspace(
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
@@ -1639,14 +1688,32 @@ where
                let token_cloned = token.clone();
                let result = handler(r, token).await;
                if token_cloned.is_cancelled() {
-                    info!("Cancelled request finished");
+                    // dropguard has executed: we will never turn this result into response.
+                    //
+                    // at least temporarily do {:?} logging; these failures are rare enough but
+                    // could hide difficult errors.
+                    match &result {
+                        Ok(response) => {
+                            let status = response.status();
+                            info!(%status, "Cancelled request finished successfully")
+                        }
+                        Err(e) => error!("Cancelled request finished with an error: {e:?}"),
+                    }
                }
+                // only logging for cancelled panicked request handlers is the tracing_panic_hook,
+                // which should suffice.
+                //
+                // there is still a chance to lose the result due to race between
+                // returning from here and the actual connection closing happening
+                // before outer task gets to execute. leaving that up for #5815.
                result
            }
            .in_current_span(),
        );

        match handle.await {
+            // TODO: never actually return Err from here, always Ok(...) so that we can log
+            // spanned errors. Call api_error_handler instead and return appropriate Body.
            Ok(result) => result,
            Err(e) => {
                // The handler task panicked. We have a global panic handler that logs the
@@ -1695,7 +1762,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1724,10 +1791,13 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .post("/v1/reload_auth_validation_keys", |r| {
+            api_handler(r, reload_auth_validation_keys_handler)
+        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
-        .delete("/v1/tenant/:tenant_id", |r| {
+        .delete("/v1/tenant/:tenant_shard_id", |r| {
            api_handler(r, tenant_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
@@ -1739,13 +1809,13 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
+        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
-        .post("/v1/tenant/:tenant_id/timeline", |r| {
+        .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_create_handler)
        })
        .post("/v1/tenant/:tenant_id/attach", |r| {
@@ -1789,7 +1859,7 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
-        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -3,18 +3,25 @@
 //! a neon Timeline.
 //!
 use std::path::{Path, PathBuf};
+use std::pin::Pin;
+use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
+use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
-use tokio::io::{AsyncRead, AsyncReadExt};
+use nix::NixPath;
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio_tar::Archive;
+use tokio_tar::Builder;
+use tokio_tar::HeaderMode;
 use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::*;
+use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
@@ -33,7 +40,9 @@ use utils::lsn::Lsn;
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
    // Read control file to extract the LSN
    let controlfile_path = path.join("global").join("pg_control");
-    let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
+    let controlfile_buf = std::fs::read(&controlfile_path)
+        .with_context(|| format!("reading controlfile: {controlfile_path}"))?;
+    let controlfile = ControlFileData::decode(&controlfile_buf)?;
    let lsn = controlfile.checkPoint;

    Ok(Lsn(lsn))
@@ -618,3 +627,108 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    reader.read_to_end(&mut buf).await?;
    Ok(Bytes::from(buf))
 }
+
+/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
+///
+/// The number of yields is bounded from above by the number of times poll_write is called,
+/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
+/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
+/// breathing room between units of CPU intensive preparation of buffers to be written.
+/// Once a write call is issued, the whole buffer has been prepared already, so there is no
+/// gain in splitting up the memcopy further.
+struct YieldingVec {
+    yield_budget: usize, // byte mark `should_yield` compares against the buffer length
+    // the buffer written into
+    buf: Vec<u8>,
+}
+
+impl YieldingVec {
+    fn new() -> Self {
+        Self {
+            yield_budget: 0,
+            buf: Vec::new(),
+        }
+    }
+    // Whether we should yield before accepting a write operation of the given size
+    fn should_yield(&mut self, add_buf_len: usize) -> bool {
+        // Set this limit to a small value so that we are a
+        // good async citizen and yield repeatedly (but not
+        // too often for many small writes to cause many yields)
+        const YIELD_DIST: usize = 1024;
+
+        let target_buf_len = self.buf.len() + add_buf_len;
+        let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
+        if self.yield_budget < target_buf_len {
+            self.yield_budget += add_buf_len;
+        }
+        ret
+    }
+}
+
+impl AsyncWrite for YieldingVec {
+    fn poll_write(
+        mut self: Pin<&mut Self>,
+        cx: &mut task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<std::io::Result<usize>> {
+        if self.should_yield(buf.len()) {
+            cx.waker().wake_by_ref(); // wake ourselves right away: this is a yield, not a wait
+            return Poll::Pending;
+        }
+        self.get_mut().buf.extend_from_slice(buf); // the whole slice is always accepted
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
+        Poll::Ready(Ok(())) // writes land in `buf` directly, nothing to flush
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        _cx: &mut task::Context<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        Poll::Ready(Ok(())) // no underlying resource to close
+    }
+}
+
+/// Packs the regular files and directories under `pgdata_path` into an in-memory,
+/// zstd-compressed tar archive. I/O and compression failures are returned as errors.
+pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(pgdata_path) {
+        let entry = entry?;
+        let metadata = entry.metadata().context("error getting dir entry metadata")?;
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        paths.push(entry.into_path());
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        YieldingVec::new(),
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for path in paths {
+        let rel_path = path.strip_prefix(pgdata_path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed, the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&path, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let compressed = zstd.into_inner();
+    let compressed_len = compressed.buf.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
+    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
+        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
+    }
+    Ok(compressed.buf)
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 mod auth;
 pub mod basebackup;
 pub mod config;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -638,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 ///
 /// Operations:
 /// - open ([`std::fs::OpenOptions::open`])
-/// - close (dropping [`std::fs::File`])
+/// - close (dropping [`crate::virtual_file::VirtualFile`])
 /// - close-by-replace (close by replacement algorithm)
 /// - read (`read_at`)
 /// - write (`write_at`)
@@ -1225,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the Postgres WAL redo process",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
@@ -1261,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) struct WalRedoProcessCounters {
+    pub(crate) started: IntCounter, // `pageserver_wal_redo_process_started_total`
+    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>, // one per cause
+}
+
+#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub(crate) enum WalRedoKillCause { // each variant's `IntoStaticStr` form is its `cause` label
+    WalRedoProcessDrop,
+    NoLeakChildDrop,
+    Startup,
+}
+
+impl Default for WalRedoProcessCounters {
+    fn default() -> Self {
+        let started = register_int_counter!(
+            "pageserver_wal_redo_process_started_total",
+            "Number of WAL redo processes started",
+        )
+        .unwrap();
+        // Stop events share one counter vec, split by the `cause` label.
+        let killed = register_int_counter_vec!(
+            "pageserver_wal_redo_process_stopped_total",
+            "Number of WAL redo processes stopped",
+            &["cause"],
+        )
+        .unwrap();
+        Self {
+            started,
+            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
+                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause_str: &'static str = cause.into();
+                killed.with_label_values(&[cause_str]) // child counter for this cause
+            })),
+        }
+    }
+}
+
+pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
+    Lazy::new(WalRedoProcessCounters::default); // counters are registered on first access
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
@@ -1928,7 +1959,6 @@ pub fn preinitialize_metrics() {
        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
    ]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -39,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, JwtAuth, Scope},
+    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -121,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -189,7 +190,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -217,9 +218,27 @@ async fn page_service_conn_main(
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

-    // timeout should be lower, but trying out multiple days for
-    // <https://github.com/neondatabase/neon/issues/4205>
-    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
+    let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
+    let socket_timeout_ms = (|| {
+        fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
+            // Simulate poor network conditions: sample the timeout from an exponential
+            // distribution with mean avg_timeout_ms via inverse-transform sampling,
+            // i.e. -mean * ln(1 - u). Tests are expected to use a mean of around 15.
+            if let Some(avg_timeout_ms) = avg_timeout_ms {
+                let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
+                let u = rand::random::<f32>();
+                (-avg * (1.0 - u).ln()) as u64
+            } else {
+                default_timeout_ms
+            }
+        });
+        default_timeout_ms
+    })();
+
+    // A timeout here does not mean the client died, it can happen if it's just idle for
+    // a while: we will tear down this PageServerHandler and instantiate a new one if/when
+    // they reconnect.
+    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
@@ -252,7 +271,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -266,7 +285,7 @@ impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<JwtAuth>>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
@@ -493,7 +512,11 @@ impl PageServerHandler {
            };

            if let Err(e) = &response {
-                if timeline.cancel.is_cancelled() {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_cancelled(): [`Timeline::shutdown`] has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
@@ -897,7 +920,7 @@ impl PageServerHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -909,7 +932,7 @@ impl PageServerHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id)
+        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
    }

    /// Shorthand for getting a reference to a Timeline of an Active tenant.
@@ -948,16 +971,17 @@ where
            .auth
            .as_ref()
            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
+            .map_err(|e| QueryError::Unauthorized(e.0))?;

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "jwt token scope is Tenant, but tenant id is missing"
-            )));
+            return Err(QueryError::Unauthorized(
+                "jwt token scope is Tenant, but tenant id is missing".into(),
+            ));
        }

-        info!(
-            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
+        debug!(
+            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -979,9 +1003,13 @@ where
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
+        fail::fail_point!("simulated-bad-compute-connection", |_| {
+            info!("Hit failpoint for bad connection");
+            Err(QueryError::SimulatedConnectionError)
+        });
+
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1330,6 +1358,9 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                QueryError::Shutdown
+            }
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,6 +23,7 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
+use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
@@ -30,9 +31,33 @@ pub type BlockNumber = u32;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
+    /// Found commits both before and after the given timestamp
    Present(Lsn),
+
+    /// Found no commits after the given timestamp, this means
+    /// that the newest data in the branch is older than the given
+    /// timestamp.
+    ///
+    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),
+
+    /// The queried timestamp is beyond the horizon that we look back at (PITR)
+    ///
+    /// All commits > LSN happened after the given timestamp,
+    /// but any commits < LSN might have happened before or after
+    /// the given timestamp. We don't know because no data before
+    /// the given lsn is available.
    Past(Lsn),
+
+    /// We have found no commit with a timestamp,
+    /// so we can't return anything meaningful.
+    ///
+    /// The associated LSN is the lower bound value we can safely
+    /// create branches on, but no statement is made if it is
+    /// older or newer than the timestamp.
+    ///
+    /// This variant can e.g. be returned right after a
+    /// cluster import.
    NoData(Lsn),
 }

@@ -44,6 +69,25 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum CollectKeySpaceError {
+    #[error(transparent)]
+    Decode(#[from] DeserializeError),
+    #[error(transparent)]
+    PageRead(PageReconstructError),
+    #[error("cancelled")]
+    Cancelled,
+}
+
+impl From<PageReconstructError> for CollectKeySpaceError {
+    fn from(err: PageReconstructError) -> Self {
+        match err {
+            PageReconstructError::Cancelled => Self::Cancelled,
+            err => Self::PageRead(err),
+        }
+    }
+}
+
 impl From<PageReconstructError> for CalculateLogicalSizeError {
    fn from(pre: PageReconstructError) -> Self {
        match pre {
@@ -322,10 +366,15 @@ impl Timeline {
    pub async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
-        let min_lsn = *gc_cutoff_lsn_guard;
+        // We use this method to figure out the branching LSN for the new branch, but the
+        // GC cutoff could be before the branching point and we cannot create a new branch
+        // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
+        // on the safe side.
+        let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
        let max_lsn = self.get_last_record_lsn();

        // LSNs are always 8-byte aligned. low/mid/high represent the
@@ -336,6 +385,9 @@ impl Timeline {
        let mut found_smaller = false;
        let mut found_larger = false;
        while low < high {
+            if cancel.is_cancelled() {
+                return Err(PageReconstructError::Cancelled);
+            }
            // cannot overflow, high and low are both smaller than u64::MAX / 2
            let mid = (high + low) / 2;

@@ -355,30 +407,33 @@ impl Timeline {
                low = mid + 1;
            }
        }
+        // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
+        // so the LSN of the last commit record before or at `search_timestamp`.
+        // Remove one from `low` to get `t`.
+        //
+        // FIXME: it would be better to get the LSN of the previous commit.
+        // Otherwise, if you restore to the returned LSN, the database will
+        // include physical changes from later commits that will be marked
+        // as aborted, and will need to be vacuumed away.
+        let commit_lsn = Lsn((low - 1) * 8);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
                // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
-            }
-            (true, false) => {
-                // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                Ok(LsnForTimestamp::NoData(min_lsn))
            }
            (false, true) => {
                // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                Ok(LsnForTimestamp::Past(min_lsn))
            }
-            (true, true) => {
-                // low is the LSN of the first commit record *after* the search_timestamp,
-                // Back off by one to get to the point just before the commit.
-                //
-                // FIXME: it would be better to get the LSN of the previous commit.
-                // Otherwise, if you restore to the returned LSN, the database will
-                // include physical changes from later commits that will be marked
-                // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+            (true, false) => {
+                // Only found commits with timestamps smaller than the request.
+                // It's still a valid case for branch creation, return it.
+                // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
+                // case, anyway.
+                Ok(LsnForTimestamp::Future(commit_lsn))
            }
+            (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
        }
    }

@@ -578,7 +633,6 @@ impl Timeline {
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
-        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -589,12 +643,8 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, lsn, ctx)
-                .await
-                .context("list rels")?
-            {
-                if cancel.is_cancelled() {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
@@ -611,11 +661,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    pub async fn collect_keyspace(
+    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<KeySpace> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -624,7 +674,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
+        let dbdir = DbDirectory::des(&buf)?;

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
@@ -657,7 +707,7 @@ impl Timeline {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
            let buf = self.get(slrudir_key, lsn, ctx).await?;
-            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
+            let dir = SlruSegmentDirectory::des(&buf)?;
            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
@@ -675,7 +725,7 @@ impl Timeline {
        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,106 +1,11 @@
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, Result};
-use byteorder::{ByteOrder, BE};
+use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::fmt;
 use std::ops::{AddAssign, Range};
 use std::time::Duration;

-/// Key used in the Repository kv-store.
-///
-/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
-/// for what we actually store in these fields.
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
-pub struct Key {
-    pub field1: u8,
-    pub field2: u32,
-    pub field3: u32,
-    pub field4: u32,
-    pub field5: u8,
-    pub field6: u32,
-}
-
-pub const KEY_SIZE: usize = 18;
-
-impl Key {
-    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
-    /// As long as Neon does not support tablespace (because of lack of access to local file system),
-    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
-    pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0xf) as i128) << 120)
-            | (((self.field2 & 0xFFFF) as i128) << 104)
-            | ((self.field3 as i128) << 72)
-            | ((self.field4 as i128) << 40)
-            | ((self.field5 as i128) << 32)
-            | self.field6 as i128
-    }
-
-    pub const fn from_i128(x: i128) -> Self {
-        Key {
-            field1: ((x >> 120) & 0xf) as u8,
-            field2: ((x >> 104) & 0xFFFF) as u32,
-            field3: (x >> 72) as u32,
-            field4: (x >> 40) as u32,
-            field5: (x >> 32) as u8,
-            field6: x as u32,
-        }
-    }
-
-    pub fn next(&self) -> Key {
-        self.add(1)
-    }
-
-    pub fn add(&self, x: u32) -> Key {
-        let mut key = *self;
-
-        let r = key.field6.overflowing_add(x);
-        key.field6 = r.0;
-        if r.1 {
-            let r = key.field5.overflowing_add(1);
-            key.field5 = r.0;
-            if r.1 {
-                let r = key.field4.overflowing_add(1);
-                key.field4 = r.0;
-                if r.1 {
-                    let r = key.field3.overflowing_add(1);
-                    key.field3 = r.0;
-                    if r.1 {
-                        let r = key.field2.overflowing_add(1);
-                        key.field2 = r.0;
-                        if r.1 {
-                            let r = key.field1.overflowing_add(1);
-                            key.field1 = r.0;
-                            assert!(!r.1);
-                        }
-                    }
-                }
-            }
-        }
-        key
-    }
-
-    pub fn from_slice(b: &[u8]) -> Self {
-        Key {
-            field1: b[0],
-            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
-            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
-            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
-            field5: b[13],
-            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
-        }
-    }
-
-    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
-        buf[0] = self.field1;
-        BE::write_u32(&mut buf[1..5], self.field2);
-        BE::write_u32(&mut buf[5..9], self.field3);
-        BE::write_u32(&mut buf[9..13], self.field4);
-        buf[13] = self.field5;
-        BE::write_u32(&mut buf[14..18], self.field6);
-    }
-}
+pub use pageserver_api::key::{Key, KEY_SIZE};

 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let start = key_range.start;
@@ -129,51 +34,9 @@ pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
 }

-impl fmt::Display for Key {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
-            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
-        )
-    }
-}
-
-impl Key {
-    pub const MIN: Key = Key {
-        field1: u8::MIN,
-        field2: u32::MIN,
-        field3: u32::MIN,
-        field4: u32::MIN,
-        field5: u8::MIN,
-        field6: u32::MIN,
-    };
-    pub const MAX: Key = Key {
-        field1: u8::MAX,
-        field2: u32::MAX,
-        field3: u32::MAX,
-        field4: u32::MAX,
-        field5: u8::MAX,
-        field6: u32::MAX,
-    };
-
-    pub fn from_hex(s: &str) -> Result<Self> {
-        if s.len() != 36 {
-            bail!("parse error");
-        }
-        Ok(Key {
-            field1: u8::from_str_radix(&s[0..2], 16)?,
-            field2: u32::from_str_radix(&s[2..10], 16)?,
-            field3: u32::from_str_radix(&s[10..18], 16)?,
-            field4: u32::from_str_radix(&s[18..26], 16)?,
-            field5: u8::from_str_radix(&s[26..28], 16)?,
-            field6: u32::from_str_radix(&s[28..36], 16)?,
-        })
-    }
-}
-
 /// A 'value' stored for a one Key.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq))]
 pub enum Value {
    /// An Image value contains a full copy of the value
    Image(Bytes),
@@ -197,6 +60,70 @@ impl Value {
    }
 }

+#[cfg(test)]
+mod test {
+    use super::*;
+
+    use bytes::Bytes;
+    use utils::bin_ser::BeSer;
+
+    macro_rules! roundtrip {
+        ($orig:expr, $expected:expr) => {{
+            let orig: Value = $orig;
+
+            let actual = Value::ser(&orig).unwrap();
+            let expected: &[u8] = &$expected;
+
+            assert_eq!(utils::Hex(&actual), utils::Hex(expected));
+
+            let deser = Value::des(&actual).unwrap();
+
+            assert_eq!(orig, deser);
+        }};
+    }
+
+    #[test]
+    fn image_roundtrip() {
+        let image = Bytes::from_static(b"foobar");
+        let image = Value::Image(image);
+
+        #[rustfmt::skip]
+        let expected = [
+            // top level discriminator of 4 bytes
+            0x00, 0x00, 0x00, 0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
+            // foobar
+            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
+        ];
+
+        roundtrip!(image, expected);
+    }
+
+    #[test]
+    fn walrecord_postgres_roundtrip() {
+        let rec = NeonWalRecord::Postgres {
+            will_init: true,
+            rec: Bytes::from_static(b"foobar"),
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // flattened discriminator of total 8 bytes
+            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+            // will_init
+            0x01,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
+            // foobar
+            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
+        ];
+
+        roundtrip!(rec, expected);
+    }
+}
+
 ///
 /// Result of performing GC
 ///
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,7 +12,9 @@
 //!

 use anyhow::{bail, Context};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use enumset::EnumSet;
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
@@ -23,6 +25,7 @@ use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
@@ -254,8 +257,6 @@ pub struct Tenant {

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

-    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
-
    // Cancellation token fires when we have entered shutdown().  This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,
@@ -291,6 +292,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
+    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
+        match self {
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
+            #[cfg(test)]
+            Self::Test(_) => {
+                // Not applicable to test redo manager
+            }
+        }
+    }
+
    pub async fn request_redo(
        &self,
        key: crate::repository::Key,
@@ -621,9 +632,9 @@ impl Tenant {
                    }
                };

-                info!("pending_deletion {}", pending_deletion.is_some());
+                info!("pending_deletion {}", pending_deletion);

-                if let Some(deletion) = pending_deletion {
+                if pending_deletion {
                    // as we are no longer loading, signal completion by dropping
                    // the completion while we resume deletion
                    drop(_completion);
@@ -640,7 +651,6 @@ impl Tenant {
                    }

                    match DeleteTenantFlow::resume_from_attach(
-                        deletion,
                        &tenant_clone,
                        preload,
                        tenants,
@@ -720,7 +730,7 @@ impl Tenant {
    ///
    async fn attach(
        self: &Arc<Tenant>,
-        mut init_order: Option<InitializationOrder>,
+        init_order: Option<InitializationOrder>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -737,11 +747,6 @@ impl Tenant {
            }
        };

-        // Signal that we have completed remote phase
-        init_order
-            .as_mut()
-            .and_then(|x| x.initial_tenant_load_remote.take());
-
        let mut timelines_to_resume_deletions = vec![];

        let mut remote_index_and_client = HashMap::new();
@@ -1619,6 +1624,7 @@ impl Tenant {
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<GcResult> {
        // Don't start doing work during shutdown
@@ -1641,7 +1647,7 @@ impl Tenant {
            }
        }

-        self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
+        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }

@@ -1649,22 +1655,16 @@ impl Tenant {
    /// This function is periodically called by compactor task.
    /// Also it can be explicitly requested per timeline through page server
    /// api's 'compact' command.
-    pub async fn compaction_iteration(
+    async fn compaction_iteration(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Don't start doing work during shutdown
-        if let TenantState::Stopping { .. } = self.current_state() {
+    ) -> anyhow::Result<(), timeline::CompactionError> {
+        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
+        if !self.is_active() {
            return Ok(());
        }

-        // We should only be called once the tenant has activated.
-        anyhow::ensure!(
-            self.is_active(),
-            "Cannot run compaction iteration on inactive tenant"
-        );
-
        {
            let conf = self.tenant_conf.read().unwrap();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
@@ -1695,7 +1695,7 @@ impl Tenant {

        for (timeline_id, timeline) in &timelines_to_compact {
            timeline
-                .compact(cancel, ctx)
+                .compact(cancel, EnumSet::empty(), ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
                .await?;
        }
@@ -1841,9 +1841,16 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let span = Span::current();
-                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
+                js.spawn(async move {
+                    if freeze_and_flush {
+                        timeline.flush_and_shutdown().instrument(span).await
+                    } else {
+                        timeline.shutdown().instrument(span).await
+                    }
+                });
            })
        };
+        // test_long_timeline_create_then_tenant_delete is leaning on this message
        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
            match res {
@@ -2362,7 +2369,6 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
            cancel: CancellationToken::default(),
            gate: Gate::new(format!("Tenant<{tenant_id}>")),
        }
@@ -2558,14 +2564,30 @@ impl Tenant {
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<GcResult> {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

-        let gc_timelines = self
-            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
-            .await?;
+        let gc_timelines = match self
+            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
+            .await
+        {
+            Ok(result) => result,
+            Err(e) => {
+                if let Some(PageReconstructError::Cancelled) =
+                    e.downcast_ref::<PageReconstructError>()
+                {
+                    // Handle cancellation
+                    totals.elapsed = now.elapsed();
+                    return Ok(totals);
+                } else {
+                    // Propagate other errors
+                    return Err(e);
+                }
+            }
+        };

        crate::failpoint_support::sleep_millis_async!(
            "gc_iteration_internal_after_getting_gc_timelines"
@@ -2589,7 +2611,7 @@ impl Tenant {
        // See comments in [`Tenant::branch_timeline`] for more information
        // about why branch creation task can run concurrently with timeline's GC iteration.
        for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() {
+            if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
                // We were requested to shut down. Stop and return with the progress we
                // made.
                break;
@@ -2609,6 +2631,7 @@ impl Tenant {
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
    pub async fn refresh_gc_info(
        &self,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // since this method can now be called at different rates than the configured gc loop, it
@@ -2620,7 +2643,7 @@ impl Tenant {
        // refresh all timelines
        let target_timeline_id = None;

-        self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
+        self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }

@@ -2629,6 +2652,7 @@ impl Tenant {
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // grab mutex to prevent new timelines from being created here.
@@ -2702,7 +2726,7 @@ impl Tenant {
                    .map(|&x| x.1)
                    .collect();
                timeline
-                    .update_gc_info(branchpoints, cutoff, pitr, ctx)
+                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
                    .await?;

                gc_timelines.push(timeline);
@@ -2865,7 +2889,7 @@ impl Tenant {
    }

    /// - run initdb to init temporary instance and get bootstrap data
-    /// - after initialization complete, remove the temp dir.
+    /// - after initialization completes, tar up the temp dir and upload it to S3.
    ///
    /// The caller is responsible for activating the returned timeline.
    async fn bootstrap_timeline(
@@ -2906,6 +2930,30 @@ impl Tenant {
        let pgdata_path = &initdb_path;
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align();

+        // Upload the created data dir to S3
+        if let Some(storage) = &self.remote_storage {
+            let pgdata_zstd = import_datadir::create_tar_zst(pgdata_path).await?;
+            let pgdata_zstd = Bytes::from(pgdata_zstd);
+            backoff::retry(
+                || async {
+                    self::remote_timeline_client::upload_initdb_dir(
+                        storage,
+                        &self.tenant_id,
+                        &timeline_id,
+                        pgdata_zstd.clone(),
+                    )
+                    .await
+                },
+                |_| false,
+                3,
+                u32::MAX,
+                "persist_initdb_tar_zst",
+                // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+                backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            )
+            .await?;
+        }
+
        // Import the contents of the data directory at the initial checkpoint
        // LSN, and any WAL after that.
        // Initdb lsn will be equal to last_record_lsn which will be set after import.
@@ -3115,6 +3163,7 @@ impl Tenant {
        // (only if it is shorter than the real cutoff).
        max_retention_period: Option<u64>,
        cause: LogicalSizeCalculationCause,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<size::ModelInputs> {
        let logical_sizes_at_once = self
@@ -3137,6 +3186,7 @@ impl Tenant {
            max_retention_period,
            &mut shared_cache,
            cause,
+            cancel,
            ctx,
        )
        .await
@@ -3149,9 +3199,10 @@ impl Tenant {
    pub async fn calculate_synthetic_size(
        &self,
        cause: LogicalSizeCalculationCause,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(None, cause, ctx).await?;
+        let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;

        let size = inputs.calculate()?;

@@ -3494,6 +3545,7 @@ pub(crate) mod harness {
                // enable it in case the tests exercise code paths that use
                // debug_assert_current_span_has_tenant_and_timeline_id
                logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+                logging::Output::Stdout,
            )
            .expect("Failed to init test logging")
        });
@@ -3528,10 +3580,6 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -3926,7 +3974,13 @@ mod tests {
        // and compaction works. But it does set the 'cutoff' point so that the cross check
        // below should fail.
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .gc_iteration(
+                Some(TIMELINE_ID),
+                0x10,
+                Duration::ZERO,
+                &CancellationToken::new(),
+                &ctx,
+            )
            .await?;

        // try to branch at lsn 25, should fail because we already garbage collected the data
@@ -4029,7 +4083,13 @@ mod tests {
        tline.set_broken("test".to_owned());

        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .gc_iteration(
+                Some(TIMELINE_ID),
+                0x10,
+                Duration::ZERO,
+                &CancellationToken::new(),
+                &ctx,
+            )
            .await?;

        // The branchpoints should contain all timelines, even ones marked
@@ -4075,7 +4135,13 @@ mod tests {
            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .gc_iteration(
+                Some(TIMELINE_ID),
+                0x10,
+                Duration::ZERO,
+                &CancellationToken::new(),
+                &ctx,
+            )
            .await?;
        assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());

@@ -4103,7 +4169,13 @@ mod tests {

        // run gc on parent
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .gc_iteration(
+                Some(TIMELINE_ID),
+                0x10,
+                Duration::ZERO,
+                &CancellationToken::new(),
+                &ctx,
+            )
            .await?;

        // Check that the data is still accessible on the branch.
@@ -4292,7 +4364,9 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline
+            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+            .await?;

        let writer = tline.writer().await;
        writer
@@ -4307,7 +4381,9 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline
+            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+            .await?;

        let writer = tline.writer().await;
        writer
@@ -4322,7 +4398,9 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline
+            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+            .await?;

        let writer = tline.writer().await;
        writer
@@ -4337,7 +4415,9 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline
+            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+            .await?;

        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
@@ -4405,10 +4485,18 @@ mod tests {
            let cutoff = tline.get_last_record_lsn();

            tline
-                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    &ctx,
+                )
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
            tline.gc().await?;
        }

@@ -4485,10 +4573,18 @@ mod tests {
            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
            tline
-                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    &ctx,
+                )
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
            tline.gc().await?;
        }

@@ -4575,10 +4671,18 @@ mod tests {
            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
            tline
-                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    &ctx,
+                )
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
            tline.gc().await?;
        }

@@ -4731,7 +4835,7 @@ mod tests {
            // Keeps uninit mark in place
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown(false)
+                .shutdown()
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -327,7 +327,7 @@ mod tests {
                let mut sz: u16 = rng.gen();
                // Make 50% of the arrays small
                if rng.gen() {
-                    sz |= 63;
+                    sz &= 63;
                }
                random_array(sz.into())
            })
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -4,7 +4,6 @@ use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemotePath};
-use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, warn, Instrument, Span};

@@ -39,9 +38,6 @@ pub(crate) enum DeleteTenantError {
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

-    #[error("Tenant deletion is already in progress")]
-    AlreadyInProgress,
-
    #[error("Tenant map slot error {0}")]
    SlotError(#[from] TenantSlotError),

@@ -55,8 +51,6 @@ pub(crate) enum DeleteTenantError {
    Other(#[from] anyhow::Error),
 }

-type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
-
 fn remote_tenant_delete_mark_path(
    conf: &PageServerConf,
    tenant_id: &TenantId,
@@ -287,14 +281,14 @@ impl DeleteTenantFlow {
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let mut guard = Self::prepare(&tenant).await?;
+        Self::prepare(&tenant).await?;

-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+        if let Err(e) = Self::run_inner(conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
            return Err(e);
        }

-        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+        Self::schedule_background(conf, remote_storage, tenants, tenant);

        Ok(())
    }
@@ -304,13 +298,10 @@ impl DeleteTenantFlow {
    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
    // So the solution is to set tenant state to broken.
    async fn run_inner(
-        guard: &mut OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
    ) -> Result<(), DeleteTenantError> {
-        guard.mark_in_progress()?;
-
        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-create-remote-mark"
@@ -345,46 +336,25 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_mark_exists: bool,
        tenant: &Tenant,
-    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
-        let acquire = |t: &Tenant| {
-            Some(
-                Arc::clone(&t.delete_progress)
-                    .try_lock_owned()
-                    .expect("we're the only owner during init"),
-            )
-        };
-
+    ) -> Result<bool, DeleteTenantError> {
        if remote_mark_exists {
-            return Ok(acquire(tenant));
+            return Ok(true);
        }

        let tenant_id = tenant.tenant_id;
        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
-            Ok(acquire(tenant))
+            Ok(true)
        } else {
-            Ok(None)
+            Ok(false)
        }
    }

    pub(crate) async fn resume_from_attach(
-        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
@@ -403,19 +373,10 @@ impl DeleteTenantFlow {
            .await
            .context("attach")?;

-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
+        Self::background(tenant.conf, tenant.remote_storage.clone(), tenants, tenant).await
    }

-    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+    async fn prepare(tenant: &Arc<Tenant>) -> Result<(), DeleteTenantError> {
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -426,10 +387,6 @@ impl DeleteTenantFlow {
            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
        }

-        let guard = Arc::clone(&tenant.delete_progress)
-            .try_lock_owned()
-            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
-
        fail::fail_point!("tenant-delete-before-shutdown", |_| {
            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
        });
@@ -449,11 +406,10 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok(guard)
+        Ok(())
    }

    fn schedule_background(
-        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
@@ -469,9 +425,7 @@ impl DeleteTenantFlow {
            "tenant_delete",
            false,
            async move {
-                if let Err(err) =
-                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
-                {
+                if let Err(err) = Self::background(conf, remote_storage, tenants, &tenant).await {
                    error!("Error: {err:#}");
                    tenant.set_broken(format!("{err:#}")).await;
                };
@@ -486,7 +440,6 @@ impl DeleteTenantFlow {
    }

    async fn background(
-        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
@@ -550,8 +503,6 @@ impl DeleteTenantFlow {
                .set(locked.len() as u64);
        }

-        *guard = Self::Finished;
-
        Ok(())
    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -573,10 +573,10 @@ impl<const L: usize> BuildNode<L> {
        BuildNode {
            num_children: 0,
            level,
-            prefix: Vec::with_capacity(16),
+            prefix: Vec::new(),
            suffix_len: 0,
-            keys: Vec::with_capacity(5024),
-            values: Vec::with_capacity(3140),
+            keys: Vec::new(),
+            values: Vec::new(),
            size: NODE_HDR_SIZE,
        }
    }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -190,6 +190,7 @@ use chrono::{NaiveDateTime, Utc};

 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
+pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -249,6 +250,8 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

+pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
+
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -816,7 +819,7 @@ impl RemoteTimelineClient {
        let mut receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            self.schedule_barrier(upload_queue)
+            self.schedule_barrier0(upload_queue)
        };

        if receiver.changed().await.is_err() {
@@ -825,7 +828,14 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    fn schedule_barrier(
+    pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_barrier0(upload_queue);
+        Ok(())
+    }
+
+    fn schedule_barrier0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
    ) -> tokio::sync::watch::Receiver<()> {
@@ -1229,16 +1239,18 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(delete) => self
-                    .deletion_queue_client
-                    .push_layers(
-                        self.tenant_id,
-                        self.timeline_id,
-                        self.generation,
-                        delete.layers.clone(),
-                    )
-                    .await
-                    .map_err(|e| anyhow::anyhow!(e)),
+                UploadOp::Delete(delete) => {
+                    pausable_failpoint!("before-delete-layer-pausable");
+                    self.deletion_queue_client
+                        .push_layers(
+                            self.tenant_id,
+                            self.timeline_id,
+                            self.generation,
+                            delete.layers.clone(),
+                        )
+                        .await
+                        .map_err(|e| anyhow::anyhow!(e))
+                }
                UploadOp::Barrier(_) => {
                    // unreachable. Barrier operations are handled synchronously in
                    // launch_queued_tasks
@@ -1528,6 +1540,13 @@ pub fn remote_layer_path(
    RemotePath::from_string(&path).expect("Failed to construct path")
 }

+pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
+    ))
+    .expect("Failed to construct path")
+}
+
 pub fn remote_index_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -128,6 +128,14 @@ impl IndexPart {
    pub fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn
    }
+
+    pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice::<IndexPart>(bytes)
+    }
+
+    pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
+        serde_json::to_vec(self)
+    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -201,7 +209,7 @@ mod tests {
            deleted_at: None,
        };

-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -239,7 +247,7 @@ mod tests {
            deleted_at: None,
        };

-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -279,7 +287,7 @@ mod tests {
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };

-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -323,7 +331,7 @@ mod tests {
            deleted_at: None,
        };

-        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
+        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();

        assert_eq!(empty_layers_parsed, expected);
    }
@@ -361,7 +369,7 @@ mod tests {
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };

-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,6 +1,7 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
+use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use std::io::ErrorKind;
@@ -9,7 +10,9 @@ use tokio::fs;
 use super::Generation;
 use crate::{
    config::PageServerConf,
-    tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
+    tenant::remote_timeline_client::{
+        index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
+    },
 };
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
@@ -33,8 +36,9 @@ pub(super) async fn upload_index_part<'a>(
    });
    pausable_failpoint!("before-upload-index-pausable");

-    let index_part_bytes =
-        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
+    let index_part_bytes = index_part
+        .to_s3_bytes()
+        .context("serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

@@ -103,3 +107,22 @@ pub(super) async fn upload_timeline_layer<'a>(

    Ok(())
 }
+
+/// Uploads the given `initdb` data to the remote storage.
+pub(crate) async fn upload_initdb_dir(
+    storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    initdb_dir: Bytes,
+) -> anyhow::Result<()> {
+    tracing::trace!("uploading initdb dir");
+
+    let size = initdb_dir.len();
+    let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
+
+    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
+    storage
+        .upload_storage_object(bytes, size, &remote_path)
+        .await
+        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+}
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -114,11 +114,12 @@ pub(super) async fn gather_inputs(
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
+    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
    tenant
-        .refresh_gc_info(ctx)
+        .refresh_gc_info(cancel, ctx)
        .await
        .context("Failed to refresh gc_info before gathering inputs")?;

@@ -350,10 +351,6 @@ async fn fill_logical_sizes(
    // our advantage with `?` error handling.
    let mut joinset = tokio::task::JoinSet::new();

-    let cancel = tokio_util::sync::CancellationToken::new();
-    // be sure to cancel all spawned tasks if we are dropped
-    let _dg = cancel.clone().drop_guard();
-
    // For each point that would benefit from having a logical size available,
    // spawn a Task to fetch it, unless we have it cached already.
    for seg in segments.iter() {
@@ -371,15 +368,8 @@ async fn fill_logical_sizes(
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
                joinset.spawn(
-                    calculate_logical_size(
-                        parallel_size_calcs,
-                        timeline,
-                        lsn,
-                        cause,
-                        ctx,
-                        cancel.child_token(),
-                    )
-                    .in_current_span(),
+                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
+                        .in_current_span(),
                );
            }
            e.insert(cached_size);
@@ -487,14 +477,13 @@ async fn calculate_logical_size(
    lsn: utils::lsn::Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
-    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
        .await
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -289,7 +289,9 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+        let loaded = DeltaLayerInner::load(&path, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_filename = path.file_name().unwrap().to_owned();
@@ -610,18 +612,28 @@ impl Drop for DeltaLayerWriter {
 }

 impl DeltaLayerInner {
+    /// Returns a nested result following `Result<Result<_, OpErr>, Critical>`:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{path}'"))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // TODO: this should be an assertion instead; see ImageLayerInner::load
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;

        if let Some(mut expected_summary) = summary {
            // production code path
@@ -636,11 +648,11 @@ impl DeltaLayerInner {
            }
        }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -249,7 +249,9 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_filename = path.file_name().unwrap().to_owned();
@@ -295,18 +297,31 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
+    /// Returns a nested result following `Result<Result<_, OpErr>, Critical>`:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        lsn: Lsn,
        summary: Option<Summary>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{}'", path))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // A wrong length is the only way this could fail, so it is not likely at all unless
+        // read_blk returns a wrong-sized block.
+        //
+        // TODO: confirm and make this into an assertion
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;

        if let Some(mut expected_summary) = summary {
            // production code path
@@ -322,12 +337,12 @@ impl ImageLayerInner {
            }
        }

-        Ok(ImageLayerInner {
+        Ok(Ok(ImageLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            lsn,
            file,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -251,6 +251,7 @@ impl Layer {

        layer
            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
            .await
    }

@@ -867,6 +868,9 @@ impl LayerInner {
            }
            Ok((Err(e), _permit)) => {
                // FIXME: this should be with the spawned task and be cancellation sensitive
+                //
+                // while we should not need this, this backoff has turned out to be useful with
+                // a bug where a remote layer file was unexpectedly deleted (#5787).
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -1195,7 +1199,7 @@ impl DownloadedLayer {
                ));
                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
                    .await
-                    .map(LayerKind::Delta)
+                    .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1206,21 +1210,32 @@ impl DownloadedLayer {
                ));
                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
                    .await
-                    .map(LayerKind::Image)
-            }
-            // this will be a permanent failure
-            .context("load layer");
+                    .map(|res| res.map(LayerKind::Image))
+            };

-            if res.is_err() {
-                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+            match res {
+                Ok(Ok(layer)) => Ok(Ok(layer)),
+                Ok(Err(transient)) => Err(transient),
+                Err(permanent) => {
+                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                    // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                    // here as well
+                    let permanent = permanent.context("load layer");
+                    tracing::error!("layer loading failed permanently: {permanent:#}");
+                    Ok(Err(permanent))
+                }
            }
-            res
        };
-        self.kind.get_or_init(init).await.as_ref().map_err(|e| {
-            // errors are not clonabled, cannot but stringify
-            // test_broken_timeline matches this string
-            anyhow::anyhow!("layer loading failed: {e:#}")
-        })
+        self.kind
+            .get_or_try_init(init)
+            // return transient errors using `?`
+            .await?
+            .as_ref()
+            .map_err(|e| {
+                // errors are not clonable, so we can only stringify them
+                // test_broken_timeline matches this string
+                anyhow::anyhow!("layer loading failed: {e:#}")
+            })
    }

    async fn get_value_reconstruct_data(
@@ -1291,6 +1306,7 @@ impl ResidentLayer {
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    #[tracing::instrument(skip_all, fields(layer=%self))]
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -180,16 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                        1.0,
                        MAX_BACKOFF_SECS,
                    );
                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
+                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                    );
-                    Duration::from_secs_f64(wait_duration)
+                    wait_duration
                } else {
                    error_run_count = 0;
                    period
@@ -198,6 +198,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

+            // Perhaps we did no work and the walredo process has been idle for some time:
+            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+            tenant.walredo_mgr.maybe_quiesce(period * 10);
+
            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
@@ -257,20 +261,20 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run gc
                let res = tenant
-                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                    .await;
                if let Err(e) = res {
                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                        1.0,
                        MAX_BACKOFF_SECS,
                    );
                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
                    error!(
-                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
+                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                    );
-                    Duration::from_secs_f64(wait_duration)
+                    wait_duration
                } else {
                    error_run_count = 0;
                    period
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -10,6 +10,7 @@ mod walreceiver;
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::models::{
@@ -36,7 +37,6 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
-use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -50,6 +50,7 @@ use crate::tenant::{
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
 };
+use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -247,7 +248,7 @@ pub struct Timeline {
    /// the flush finishes. You can use that to wait for the flush to finish.
    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
-    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -374,6 +375,19 @@ pub enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

+#[derive(thiserror::Error, Debug)]
+enum FlushLayerError {
+    /// Timeline cancellation token was cancelled
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error(transparent)]
+    PageReconstructError(#[from] PageReconstructError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 impl std::fmt::Debug for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
@@ -424,6 +438,11 @@ pub enum LogicalSizeCalculationCause {
    TenantSizeHandler,
 }

+#[derive(enumset::EnumSetType)]
+pub(crate) enum CompactFlags {
+    ForceRepartition,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -681,6 +700,7 @@ impl Timeline {
    pub(crate) async fn compact(
        self: &Arc<Self>,
        cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
        // this wait probably never needs any "long time spent" logging, because we already nag if
@@ -753,6 +773,7 @@ impl Timeline {
            .repartition(
                self.get_last_record_lsn(),
                self.get_compaction_target_size(),
+                flags,
                ctx,
            )
            .await
@@ -891,15 +912,16 @@ impl Timeline {
        self.launch_eviction_task(background_jobs_can_start);
    }

+    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    ///
+    /// While we are flushing, we continue to accept read I/O.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
-    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
+    pub(crate) async fn flush_and_shutdown(&self) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Signal any subscribers to our cancellation token to drop out
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
-        // prevent writes to the InMemoryLayer
+        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
+        // trying to flush
        tracing::debug!("Waiting for WalReceiverManager...");
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
@@ -908,40 +930,70 @@ impl Timeline {
        )
        .await;

+        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
+        self.last_record_lsn.shutdown();
+
        // now all writers to InMemory layer are gone, do the final flush if requested
-        if freeze_and_flush {
-            match self.freeze_and_flush().await {
-                Ok(()) => {}
-                Err(e) => {
-                    warn!("failed to freeze and flush: {e:#}");
-                    return; // TODO: should probably drain remote timeline client anyways?
+        match self.freeze_and_flush().await {
+            Ok(_) => {
+                // drain the upload queue
+                if let Some(client) = self.remote_client.as_ref() {
+                    // if we did not wait for completion here, it might be our shutdown process
+                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                    // be spawned.
+                    //
+                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                    // obviously it does not make sense to stop while we wait for it, but what
+                    // about corner cases like s3 suddenly hanging up?
+                    if let Err(e) = client.wait_completion().await {
+                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                        // we have some extra WAL replay to do next time the timeline starts.
+                        warn!("failed to flush to remote storage: {e:#}");
+                    }
                }
            }
-
-            // drain the upload queue
-            let res = if let Some(client) = self.remote_client.as_ref() {
-                // if we did not wait for completion here, it might be our shutdown process
-                // didn't wait for remote uploads to complete at all, as new tasks can forever
-                // be spawned.
-                //
-                // what is problematic is the shutting down of RemoteTimelineClient, because
-                // obviously it does not make sense to stop while we wait for it, but what
-                // about corner cases like s3 suddenly hanging up?
-                client.wait_completion().await
-            } else {
-                Ok(())
-            };
-
-            if let Err(e) = res {
-                warn!("failed to await for frozen and flushed uploads: {e:#}");
+            Err(e) => {
+                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                // we have some extra WAL replay to do next time the timeline starts.
+                warn!("failed to freeze and flush: {e:#}");
            }
        }

+        self.shutdown().await;
+    }
+
+    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
+    /// the graceful [`Timeline::flush_and_shutdown`] function.
+    pub(crate) async fn shutdown(&self) {
+        // Signal any subscribers to our cancellation token to drop out
+        tracing::debug!("Cancelling CancellationToken");
+        self.cancel.cancel();
+
        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
        // while doing so.
        self.last_record_lsn.shutdown();

+        // Shut down the layer flush task before the remote client, as one depends on the other
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::LayerFlushTask),
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+        )
+        .await;
+
+        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
+        // case our caller wants to use that for a deletion
+        if let Some(remote_client) = self.remote_client.as_ref() {
+            match remote_client.stop() {
+                Ok(()) => {}
+                Err(StopError::QueueUninitialized) => {
+                    // Shutting down during initialization is legal
+                }
+            }
+        }
+
        tracing::debug!("Waiting for tasks...");
+
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;

        // Finally wait until any gate-holders are complete
@@ -985,7 +1037,12 @@ impl Timeline {
            reason,
            backtrace: backtrace_str,
        };
-        self.set_state(broken_state)
+        self.set_state(broken_state);
+
+        // Although the Broken state is not equivalent to shutdown() (shutdown will be called
+        // later when this tenant is detached or the process shuts down), firing the cancellation token
+        // here avoids the need for other tasks to watch for the Broken state explicitly.
+        self.cancel.cancel();
    }

    pub fn current_state(&self) -> TimelineState {
@@ -1662,6 +1719,30 @@ impl Timeline {
        if let Some(rtc) = self.remote_client.as_ref() {
            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
            rtc.schedule_index_upload_for_file_changes()?;
+            // This barrier orders above DELETEs before any later operations.
+            // This is critical because code executing after the barrier might
+            // re-create objects with the same key that we just scheduled for deletion.
+            // For example, if we just scheduled deletion of an image layer "from the future",
+            // later compaction might run again and re-create the same image layer.
+            // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
+            // "same" here means same key range and LSN.
+            //
+            // Without a barrier between above DELETEs and the re-creation's PUTs,
+            // the upload queue may execute the PUT first, then the DELETE.
+            // In our example, we will end up with an IndexPart referencing a non-existent object.
+            //
+            // 1. a future image layer is created and uploaded
+            // 2. ps restart
+            // 3. the future layer from (1) is deleted during load layer map
+            // 4. image layer is re-created and uploaded
+            // 5. deletion queue would like to delete (1) but actually deletes (4)
+            // 6. delete by name works as expected, but it now deletes the wrong (later) version
+            //
+            // See https://github.com/neondatabase/neon/issues/5878
+            //
+            // NB: generation numbers naturally protect against this because they disambiguate
+            //     (1) and (4)
+            rtc.schedule_barrier()?;
            // Tenant::create_timeline will wait for these uploads to happen before returning, or
            // on retry.
        }
@@ -1741,12 +1822,8 @@ impl Timeline {
                // delay will be terminated by a timeout regardless.
                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

-                // no extra cancellation here, because nothing really waits for this to complete compared
-                // to spawn_ondemand_logical_size_calculation.
-                let cancel = CancellationToken::new();
-
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
                    .await
                {
                    Ok(s) => s,
@@ -1815,7 +1892,6 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
-        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
        let (sender, receiver) = oneshot::channel();
        let self_clone = Arc::clone(self);
@@ -1836,7 +1912,7 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
+                    .logical_size_calculation_task(lsn, cause, &ctx)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
@@ -1852,58 +1928,28 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
-        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();

-        let mut timeline_state_updates = self.subscribe_for_state_updates();
+        let _guard = self.gate.enter();
+
        let self_calculation = Arc::clone(self);

        let mut calculation = pin!(async {
-            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cause, cancel, &ctx)
+                .calculate_logical_size(lsn, cause, &ctx)
                .await
        });
-        let timeline_state_cancellation = async {
-            loop {
-                match timeline_state_updates.changed().await {
-                    Ok(()) => {
-                        let new_state = timeline_state_updates.borrow().clone();
-                        match new_state {
-                            // we're running this job for active timelines only
-                            TimelineState::Active => continue,
-                            TimelineState::Broken { .. }
-                            | TimelineState::Stopping
-                            | TimelineState::Loading => {
-                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
-                            }
-                        }
-                    }
-                    Err(_sender_dropped_error) => {
-                        // can't happen, the sender is not dropped as long as the Timeline exists
-                        break "aborted because state watch was dropped".to_string();
-                    }
-                }
-            }
-        };
-
-        let taskmgr_shutdown_cancellation = async {
-            task_mgr::shutdown_watcher().await;
-            "aborted because task_mgr shutdown requested".to_string()
-        };

        tokio::select! {
            res = &mut calculation => { res }
-            reason = timeline_state_cancellation => {
-                debug!(reason = reason, "cancelling calculation");
-                cancel.cancel();
+            _ = self.cancel.cancelled() => {
+                debug!("cancelling logical size calculation for timeline shutdown");
                calculation.await
            }
-            reason = taskmgr_shutdown_cancellation => {
-                debug!(reason = reason, "cancelling calculation");
-                cancel.cancel();
+            _ = task_mgr::shutdown_watcher() => {
+                debug!("cancelling logical size calculation for task shutdown");
                calculation.await
            }
        }
@@ -1917,7 +1963,6 @@ impl Timeline {
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
-        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        info!(
@@ -1960,7 +2005,7 @@ impl Timeline {
        };
        let timer = storage_time_metrics.start_timer();
        let logical_size = self
-            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
+            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -2373,6 +2418,10 @@ impl Timeline {
        info!("started flush loop");
        loop {
            tokio::select! {
+                _ = self.cancel.cancelled() => {
+                    info!("shutting down layer flush task");
+                    break;
+                },
                _ = task_mgr::shutdown_watcher() => {
                    info!("shutting down layer flush task");
                    break;
@@ -2384,6 +2433,14 @@ impl Timeline {
            let timer = self.metrics.flush_time_histo.start_timer();
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
+                if self.cancel.is_cancelled() {
+                    info!("dropping out of flush loop for timeline shutdown");
+                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
+                    // anyone waiting on that will respect self.cancel as well: they will stop
+                    // waiting at the same time as we drop out of this loop.
+                    return;
+                }
+
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    guard.layer_map().frozen_layers.front().cloned()
@@ -2392,9 +2449,18 @@ impl Timeline {
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
-                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    error!("could not flush frozen layer: {err:?}");
-                    break Err(err);
+                match self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    Ok(()) => {}
+                    Err(FlushLayerError::Cancelled) => {
+                        info!("dropping out of flush loop for timeline shutdown");
+                        return;
+                    }
+                    err @ Err(
+                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
+                    ) => {
+                        error!("could not flush frozen layer: {err:?}");
+                        break err;
+                    }
                }
            };
            // Notify any listeners that we're done
@@ -2443,7 +2509,17 @@ impl Timeline {
                }
            }
            trace!("waiting for flush to complete");
-            rx.changed().await?;
+            tokio::select! {
+                rx_e = rx.changed() => {
+                    rx_e?;
+                },
+                // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
+                // the notification from [`flush_loop`] that it completed.
+                _ = self.cancel.cancelled() => {
+                    tracing::info!("Cancelled layer flush due on timeline shutdown");
+                    return Ok(())
+                }
+            };
            trace!("done")
        }
    }
@@ -2458,7 +2534,7 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), FlushLayerError> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -2481,8 +2557,18 @@ impl Timeline {
                // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
                // require downloading anything during initial import.
                let (partitioning, _lsn) = self
-                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
+                    .repartition(
+                        self.initdb_lsn,
+                        self.get_compaction_target_size(),
+                        EnumSet::empty(),
+                        ctx,
+                    )
                    .await?;
+
+                if self.cancel.is_cancelled() {
+                    return Err(FlushLayerError::Cancelled);
+                }
+
                // For image layers, we add them immediately into the layer map.
                (
                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
@@ -2514,6 +2600,12 @@ impl Timeline {
                )
            };

+        pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
+
+        if self.cancel.is_cancelled() {
+            return Err(FlushLayerError::Cancelled);
+        }
+
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

@@ -2523,6 +2615,10 @@ impl Timeline {
        let metadata = {
            let mut guard = self.layers.write().await;

+            if self.cancel.is_cancelled() {
+                return Err(FlushLayerError::Cancelled);
+            }
+
            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);

            if disk_consistent_lsn != old_disk_consistent_lsn {
@@ -2687,12 +2783,16 @@ impl Timeline {
        &self,
        lsn: Lsn,
        partition_size: u64,
+        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
        {
            let partitioning_guard = self.partitioning.lock().unwrap();
            let distance = lsn.0 - partitioning_guard.1 .0;
-            if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
+            if partitioning_guard.1 != Lsn(0)
+                && distance <= self.repartition_threshold
+                && !flags.contains(CompactFlags::ForceRepartition)
+            {
                debug!(
                    distance,
                    threshold = self.repartition_threshold,
@@ -3440,21 +3540,22 @@ impl Timeline {
            }

            // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-            let mut layer_paths: Vec<Utf8PathBuf> = new_layers
+            let layer_paths: Vec<Utf8PathBuf> = new_layers
                .iter()
                .map(|l| l.local_path().to_owned())
                .collect();

            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
-            //
-            // FIXME: spawn_blocking above for this
-            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
+            par_fsync::par_fsync_async(&layer_paths)
+                .await
+                .context("fsync all new layers")?;

-            par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
+            let timeline_dir = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
+
+            par_fsync::par_fsync_async(&[timeline_dir])
+                .await
                .context("fsync of timeline dir")?;
-
-            layer_paths.pop().unwrap();
        }

        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
@@ -3627,6 +3728,7 @@ impl Timeline {
        retain_lsns: Vec<Lsn>,
        cutoff_horizon: Lsn,
        pitr: Duration,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
@@ -3640,7 +3742,10 @@ impl Timeline {
            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);

-                match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? {
+                match self
+                    .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
+                    .await?
+                {
                    LsnForTimestamp::Present(lsn) => lsn,
                    LsnForTimestamp::Future(lsn) => {
                        // The timestamp is in the future. That sounds impossible,
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -110,35 +110,6 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
    Ok(())
 }

-// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
-// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
-// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
-// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
-// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
-// So we can just remove the mark file.
-async fn create_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<(), DeleteTimelineError> {
-    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-delete-mark"
-        ))?
-    });
-    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-    Ok(())
-}
-
 /// Grab the layer_removal_cs lock, and actually perform the deletion.
 ///
 /// This lock prevents prevents GC or compaction from running at the same time.
@@ -311,6 +282,8 @@ async fn cleanup_remaining_timeline_fs_traces(
        .context("fsync_pre_mark_remove")?;

    // Remove delete mark
+    // TODO: once we are confident that no more legacy delete marks exist in the field, remove
+    // this call.  It cleans up a legacy marker file that might in rare cases be present.
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
        .or_else(fs_ext::ignore_not_found)
@@ -391,8 +364,6 @@ impl DeleteTimelineFlow {

        set_deleted_in_remote_index(&timeline).await?;

-        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
-
        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-schedule"
@@ -464,10 +435,6 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

-        // Note that delete mark can be missing on resume
-        // because we create delete mark after we set deleted_at in the index part.
-        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
-
        Self::schedule_background(guard, tenant.conf, tenant, timeline);

        Ok(())
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
    context::{DownloadBehavior, RequestContext},
+    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -326,8 +327,7 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
-                    .await;
+                self.imitate_timeline_cached_layer_accesses(ctx).await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
@@ -351,7 +351,7 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
+                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now());
            }
@@ -367,21 +367,12 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(
-        &self,
-        cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) {
+    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(
-                lsn,
-                LogicalSizeCalculationCause::EvictionTaskImitation,
-                cancel.clone(),
-                ctx,
-            )
+            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
            .instrument(info_span!("calculate_logical_size"))
            .await;

@@ -407,9 +398,16 @@ impl Timeline {
            if size.is_err() {
                // ignore, see above comment
            } else {
-                warn!(
-                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
-                );
+                match e {
+                    CollectKeySpaceError::Cancelled => {
+                        // Shutting down, ignore
+                    }
+                    err => {
+                        warn!(
+                            "failed to collect keyspace but succeeded in calculating logical size: {err:#}"
+                        );
+                    }
+                }
            }
        }
    }
@@ -419,8 +417,8 @@ impl Timeline {
    async fn imitate_synthetic_size_calculation_worker(
        &self,
        tenant: &Arc<Tenant>,
-        ctx: &RequestContext,
        cancel: &CancellationToken,
+        ctx: &RequestContext,
    ) {
        if self.conf.metric_collection_endpoint.is_none() {
            // We don't start the consumption metrics task if this is not set in the config.
@@ -459,6 +457,7 @@ impl Timeline {
            None,
            &mut throwaway_cache,
            LogicalSizeCalculationCause::EvictionTaskImitation,
+            cancel,
            ctx,
        )
        .instrument(info_span!("gather_inputs"));
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -45,12 +45,20 @@ impl<'t> UninitializedTimeline<'t> {
        let timeline_id = self.timeline_id;
        let tenant_id = self.owning_tenant.tenant_id;

-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
+        if self.raw_timeline.is_none() {
+            return Err(anyhow::anyhow!(
+                "No timeline for initialization found for {tenant_id}/{timeline_id}"
+            ));
+        }

        // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        let new_disk_consistent_lsn = self
+            .raw_timeline
+            .as_ref()
+            .expect("checked above")
+            .0
+            .get_disk_consistent_lsn();
+
        anyhow::ensure!(
            new_disk_consistent_lsn.is_valid(),
            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
@@ -62,6 +70,13 @@ impl<'t> UninitializedTimeline<'t> {
                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
            ),
            Entry::Vacant(v) => {
+                // after taking, there should be no fallible operations, because the drop guard will
+                // not clean up afterwards and would block, for example, the tenant deletion
+                let (new_timeline, uninit_mark) =
+                    self.raw_timeline.take().expect("already checked");
+
+                // this is the mutual exclusion between different retries to create the timeline;
+                // this should be an assertion.
                uninit_mark.remove_uninit_mark().with_context(|| {
                    format!(
                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
@@ -70,10 +85,10 @@ impl<'t> UninitializedTimeline<'t> {
                v.insert(Arc::clone(&new_timeline));

                new_timeline.maybe_spawn_flush_loop();
+
+                Ok(new_timeline)
            }
        }
-
-        Ok(new_timeline)
    }

    /// Prepares timeline data by loading it from the basebackup archive.
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,8 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};

 use crate::config::PageServerConf;
 use crate::metrics::{
-    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
-    WAL_REDO_WAIT_TIME,
+    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
+    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
@@ -91,6 +91,7 @@ struct ProcessOutput {
 pub struct PostgresRedoManager {
    tenant_id: TenantId,
    conf: &'static PageServerConf,
+    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
 }

@@ -187,10 +188,26 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_id,
            conf,
+            last_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
        }
    }

+    /// This type doesn't have its own background task to check for idleness: we
+    /// rely on our owner calling this function periodically in its own housekeeping
+    /// loops.
+    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
+        if let Ok(g) = self.last_redo_at.try_lock() {
+            if let Some(last_redo_at) = *g {
+                if last_redo_at.elapsed() >= idle_timeout {
+                    drop(g);
+                    let mut guard = self.redo_process.write().unwrap();
+                    *guard = None;
+                }
+            }
+        }
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -205,13 +222,12 @@ impl PostgresRedoManager {
        wal_redo_timeout: Duration,
        pg_version: u32,
    ) -> anyhow::Result<Bytes> {
+        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
+
        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
-        let start_time = Instant::now();
        let mut n_attempts = 0u32;
        loop {
-            let lock_time = Instant::now();
-
            // launch the WAL redo process on first use
            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
@@ -236,7 +252,7 @@ impl PostgresRedoManager {
                }
            };

-            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
            let buf_tag = BufferTag { rel, blknum };
@@ -244,8 +260,7 @@ impl PostgresRedoManager {
                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
                .context("apply_wal_records");

-            let end_time = Instant::now();
-            let duration = end_time.duration_since(lock_time);
+            let duration = started_at.elapsed();

            let len = records.len();
            let nbytes = records.iter().fold(0, |acumulator, record| {
@@ -352,12 +367,13 @@ impl PostgresRedoManager {
            self.apply_record_neon(key, &mut page, *record_lsn, record)?;
        }
        // Success!
-        let end_time = Instant::now();
-        let duration = end_time.duration_since(start_time);
+        let duration = start_time.elapsed();
+        // FIXME: using the same metric here creates a bimodal distribution by default, and because
+        // there could be multiple batch sizes this would be N+1 modal.
        WAL_REDO_TIME.observe(duration.as_secs_f64());

        debug!(
-            "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            "neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
            records.len(),
            duration.as_micros(),
            lsn
@@ -596,21 +612,21 @@ trait CloseFileDescriptors: CommandExt {

 impl<C: CommandExt> CloseFileDescriptors for C {
    fn close_fds(&mut self) -> &mut Command {
+        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
+        // which means it should be safe to execute inside a signal handler.
+        // The precise meaning depends on platform. See `man signal-safety`
+        // for the linux definition.
+        //
+        // The set_fds_cloexec_threadsafe function is documented to be
+        // async-signal-safe.
+        //
+        // Aside from this function, the rest of the code is re-entrant and
+        // doesn't make any syscalls. We're just passing constants.
+        //
+        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
+        // which is not async-signal-safe. Be careful.
        unsafe {
            self.pre_exec(move || {
-                // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-                // which means it should be safe to execute inside a signal handler.
-                // The precise meaning depends on platform. See `man signal-safety`
-                // for the linux definition.
-                //
-                // The set_fds_cloexec_threadsafe function is documented to be
-                // async-signal-safe.
-                //
-                // Aside from this function, the rest of the code is re-entrant and
-                // doesn't make any syscalls. We're just passing constants.
-                //
-                // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-                // which is not async-signal-safe. Be careful.
                close_fds::set_fds_cloexec_threadsafe(3, &[]);
                Ok(())
            })
@@ -667,10 +683,10 @@ impl WalRedoProcess {
            .close_fds()
            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
-
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait();
+            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
@@ -1001,7 +1017,7 @@ impl Drop for WalRedoProcess {
        self.child
            .take()
            .expect("we only do this once")
-            .kill_and_wait();
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
@@ -1037,16 +1053,19 @@ impl NoLeakChild {
        })
    }

-    fn kill_and_wait(mut self) {
+    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
        let child = match self.child.take() {
            Some(child) => child,
            None => return,
        };
-        Self::kill_and_wait_impl(child);
+        Self::kill_and_wait_impl(child, cause);
    }

-    #[instrument(skip_all, fields(pid=child.id()))]
-    fn kill_and_wait_impl(mut child: Child) {
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
        let res = child.kill();
        if let Err(e) = res {
            // This branch is very unlikely because:
@@ -1091,7 +1110,7 @@ impl Drop for NoLeakChild {
                // This thread here is going to outlive of our dropper.
                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
-                Self::kill_and_wait_impl(child);
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
            .await
        });
--- a/Show More
+++ b/Show More