Reduce the number of iterations in test_physical_replication to reduce test time

Fix test_physical_replication test to take into account the autocommit behaviour of psycopg
refactor(pageserver): use tokio::signal instead of spawn_blocking (#7332 )
2026-03-04 00:40:38 +00:00 · 2024-05-22 11:52:31 +03:00 · 2024-04-08 17:37:48 +03:00 · 2024-04-08 09:35:32 +00:00 · 2024-04-08 09:01:38 +03:00 · 2024-04-07 21:21:18 +00:00
161 changed files with 5884 additions and 1906 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,6 +22,7 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,15 +147,16 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
+                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'
        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -171,7 +172,7 @@ jobs:
        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                   { "platform": "rds-aurora"   }]')
+                                                     { "platform": "rds-aurora"   }]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -190,7 +191,7 @@ jobs:
        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                    { "platform": "rds-aurora",   "scale": "10" }]')
+                                                     { "platform": "rds-aurora",   "scale": "10" }]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -253,6 +254,9 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -270,11 +274,15 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -401,11 +409,15 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -507,11 +519,15 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -597,11 +613,15 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1127,6 +1127,7 @@ jobs:
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
@@ -1136,6 +1137,7 @@ jobs:
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1144,6 +1146,7 @@ jobs:
              -f deployProxy=true \
              -f deployStorage=false \
              -f deployStorageBroker=false \
              -f deployStorageController=false \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -62,14 +62,14 @@ jobs:
  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: ubuntu-latest
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
    steps:
      - name: check if ecr image are present
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,41 +79,55 @@ jobs:
            fi
          done
-      - name: Set PR's status to pending and request a remote CI test
+      - name: Set e2e-platforms
        id: e2e-platforms
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # Default set of platforms to run e2e tests on
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          platforms='["docker", "k8s"]'
          # to place a job run status update later.
          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
                  # no-op
                  ;;
              esac
            done
          else
            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
          fi
-          curl -f -X POST \
+          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
          -H "Accept: application/vnd.github.v3+json" \
          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
          --data \
            "{
              \"state\": \"pending\",
              \"context\": \"neon-cloud-e2e\",
              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
            }"
-          curl -f -X POST \
+      - name: Set PR's status to pending and request a remote CI test
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+        env:
-          -H "Accept: application/vnd.github.v3+json" \
+          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          --data \
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-            "{
+        run: |
-              \"ref\": \"main\",
+          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
-              \"inputs\": {
+
-                \"ci_job_name\": \"neon-cloud-e2e\",
+          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
-                \"commit_hash\": \"$COMMIT_SHA\",
+            --method POST \
-                \"remote_repo\": \"${{ github.repository }}\",
+            --raw-field "state=pending" \
-                \"storage_image_tag\": \"${TAG}\",
+            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
-                \"compute_image_tag\": \"${TAG}\",
+            --raw-field "context=neon-cloud-e2e"
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+
-              }
+          gh workflow --repo ${REMOTE_REPO} \
-            }"
+            run testing.yml \
              --ref "main" \
              --raw-field "ci_job_name=neon-cloud-e2e" \
              --raw-field "commit_hash=$COMMIT_SHA" \
              --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
              --raw-field "storage_image_tag=${TAG}" \
              --raw-field "compute_image_tag=${TAG}" \
              --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
              --raw-field "e2e-platforms=${E2E_PLATFORMS}"
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/attachment_service @neondatabase/storage
+/storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -270,44 +270,6 @@ dependencies = [
 "critical-section",
 ]
 [[package]]
 name = "attachment_service"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "aws-config",
 "bytes",
 "camino",
 "clap",
 "control_plane",
 "diesel",
 "diesel_migrations",
 "fail",
 "futures",
 "git-version",
 "hex",
 "humantime",
 "hyper",
 "lasso",
 "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
 "r2d2",
 "reqwest",
 "routerify",
 "serde",
 "serde_json",
 "thiserror",
 "tokio",
 "tokio-util",
 "tracing",
 "utils",
 "workspace_hack",
 ]
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -2234,9 +2196,9 @@ dependencies = [
 [[package]]
 name = "h2"
-version = "0.3.24"
+version = "0.3.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
+checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
 dependencies = [
 "bytes",
 "fnv",
@@ -3435,9 +3397,9 @@ dependencies = [
 [[package]]
 name = "ordered-multimap"
-version = "0.7.1"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
+checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
 dependencies = [
 "dlv-list",
 "hashbrown 0.14.0",
@@ -4199,6 +4161,7 @@ name = "proxy"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-compression",
 "async-trait",
 "aws-config",
 "aws-sdk-iam",
@@ -5621,6 +5584,65 @@ dependencies = [
 "workspace_hack",
 ]
 [[package]]
 name = "storage_controller"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "aws-config",
 "bytes",
 "camino",
 "clap",
 "control_plane",
 "diesel",
 "diesel_migrations",
 "fail",
 "futures",
 "git-version",
 "hex",
 "humantime",
 "hyper",
 "itertools",
 "lasso",
 "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
 "r2d2",
 "reqwest",
 "routerify",
 "serde",
 "serde_json",
 "thiserror",
 "tokio",
 "tokio-util",
 "tracing",
 "utils",
 "workspace_hack",
 ]
 [[package]]
 name = "storcon_cli"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "clap",
 "comfy-table",
 "hyper",
 "pageserver_api",
 "pageserver_client",
 "reqwest",
 "serde",
 "serde_json",
 "thiserror",
 "tokio",
 "tracing",
 "utils",
 "workspace_hack",
 ]
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5777,23 +5799,23 @@ dependencies = [
 [[package]]
 name = "test-context"
-version = "0.1.4"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
+checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
 dependencies = [
 "async-trait",
 "futures",
 "test-context-macros",
 ]
 [[package]]
 name = "test-context-macros"
-version = "0.1.4"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
+checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 [[package]]
@@ -5934,9 +5956,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 [[package]]
 name = "tokio"
-version = "1.36.0"
+version = "1.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
+checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
 dependencies = [
 "backtrace",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
-    "control_plane/attachment_service",
+    "control_plane/storcon_cli",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
@@ -12,6 +12,7 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
    "storage_controller",
    "s3_scrubber",
    "workspace_hack",
    "trace",
@@ -158,7 +159,7 @@ svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.1"
+test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1262,10 +1262,12 @@ LIMIT 100",
        .await
        .map_err(DownloadError::Other);
-        self.ext_download_progress
+        if download_size.is_ok() {
-            .write()
+            self.ext_download_progress
-            .expect("bad lock")
+                .write()
-            .insert(ext_archive_name.to_string(), (download_start, true));
+                .expect("bad lock")
                .insert(ext_archive_name.to_string(), (download_start, true));
        }
        download_size
    }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
+                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -743,21 +743,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // which may happen in two cases:
    // - extension was just installed
    // - extension was already installed and is up to date
-    // DISABLED due to compute node unpinning epic
+    let query = "ALTER EXTENSION neon UPDATE";
-    // let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
-    // info!("update neon extension version with query: {}", query);
+    if let Err(e) = client.simple_query(query) {
-    // client.simple_query(query)?;
+        error!(
            "failed to upgrade neon extension during `handle_extension_neon`: {}",
            e
        );
    }
    Ok(())
 }
 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade (not really)");
+    info!("handle neon extension upgrade");
-    // DISABLED due to compute node unpinning epic
+    let query = "ALTER EXTENSION neon UPDATE";
-    // let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
-    // info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;
    // client.simple_query(query)?;
    Ok(())
 }
@@ -806,19 +809,8 @@ $$;"#,
        "",
        "",
        "",
        "",
        // Add new migrations below.
        r#"
 DO $$
 DECLARE
    role_name TEXT;
 BEGIN
    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
    LOOP
        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
    END LOOP;
 END
 $$;"#,
    ];
    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::{
+use pageserver_api::controller_api::PlacementPolicy;
    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
 };
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }
        Some(("set-state", subcommand_args)) => {
            let pageserver = get_pageserver(env, subcommand_args)?;
            let scheduling = subcommand_args.get_one("scheduling");
            let availability = subcommand_args.get_one("availability");
            let storage_controller = StorageController::from_env(env);
            storage_controller
                .node_configure(NodeConfigureRequest {
                    node_id: pageserver.conf.id,
                    scheduling: scheduling.cloned(),
                    availability: availability.cloned(),
                })
                .await?;
        }
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1515,12 +1498,6 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("set-state")
                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
                    .about("Set scheduling or availability state of pageserver node")
                    .arg(pageserver_config_args.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -389,6 +389,10 @@ impl PageServerNode {
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
                .transpose()?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -501,6 +505,12 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
                image_layer_creation_check_threshold: settings
                    .remove("image_layer_creation_check_threshold")
                    .map(|x| x.parse::<u8>())
                    .transpose()
                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
                    .remove("walreceiver_connect_timeout")
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -0,0 +1,23 @@
 [package]
 name = "storcon_cli"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 [dependencies]
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -0,0 +1,587 @@
 use std::{collections::HashMap, str::FromStr};
 use clap::{Parser, Subcommand};
 use hyper::Method;
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
        TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
 use reqwest::Url;
 use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};
 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 // Subcommands of this CLI: one variant per operation. Every field is declared with
 // `#[arg(long)]`; fields typed `Option<_>` may be omitted on the command line.
 // NOTE(review): the `///` comments below are presumably picked up by clap's derive as
 // user-visible help text, so rewording them changes `--help` output — confirm before editing.
 #[derive(Subcommand, Debug)]
 enum Command {
    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,
        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },
    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,
        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
        /// manually mark a node offline
        // Parsed by `NodeAvailabilityArg::from_str`, which accepts `active` and `offline`.
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        // Parsed by `PlacementPolicyArg::from_str`.
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        // Parsed by `ShardSchedulingPolicyArg::from_str`.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        // NOTE(review): taken as a raw string here; the expected encoding (presumably a
        // serialized `TenantConfig`) is decided where this variant is handled — confirm there.
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
 }
 // Top-level command-line arguments: where to reach the storage controller, an optional
 // JWT, and the subcommand to run. With no arguments at all, help is shown
 // (`arg_required_else_help(true)`).
 #[derive(Parser)]
 #[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
 )]
 #[command(arg_required_else_help(true))]
 struct Cli {
    #[arg(long)]
    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,
    // Optional: when absent, `Client::dispatch` sends no Authorization header.
    #[arg(long)]
    /// JWT token for authenticating with storage controller.  Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,
    #[command(subcommand)]
    command: Command,
 }
 /// Wrapper around [`PlacementPolicy`] that can be parsed from a command-line string.
 ///
 /// Accepted spellings are `detached`, `secondary` and `attached:<n>`, where `<n>` must
 /// parse as a `usize`.
 #[derive(Debug, Clone)]
 struct PlacementPolicyArg(PlacementPolicy);
 impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;
    /// Parses `s` into a placement policy.
    ///
    /// Returns an error if `s` is not one of the accepted spellings, or if the text after
    /// `attached:` is not a valid `usize`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ => match s.strip_prefix("attached:") {
                // Parse everything after the prefix, so that trailing junk such as
                // `attached:1:2` is rejected instead of being silently read as `attached:1`.
                Some(count) => count
                    .parse::<usize>()
                    .map(|n| Self(PlacementPolicy::Attached(n)))
                    .map_err(|_| {
                        anyhow::anyhow!("Invalid format '{s}', a valid example is 'attached:1'")
                    }),
                None => Err(anyhow::anyhow!(
                    "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
                )),
            },
        }
    }
 }
 /// Wrapper around [`ShardSchedulingPolicy`] that can be parsed from a command-line string.
 #[derive(Debug, Clone)]
 struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
 impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;
    /// Maps `active`, `essential`, `pause` or `stop` to the matching policy variant;
    /// any other input is an error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let policy = match s {
            "active" => ShardSchedulingPolicy::Active,
            "essential" => ShardSchedulingPolicy::Essential,
            "pause" => ShardSchedulingPolicy::Pause,
            "stop" => ShardSchedulingPolicy::Stop,
            _ => {
                return Err(anyhow::anyhow!(
                    "Unknown scheduling policy '{s}', try active,essential,pause,stop"
                ));
            }
        };
        Ok(Self(policy))
    }
 }
 /// Command line wrapper around [`NodeAvailabilityWrapper`], parsed from either
 /// `active` or `offline`.
 #[derive(Debug, Clone)]
 struct NodeAvailabilityArg(NodeAvailabilityWrapper);
 impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;
    /// Map the keyword in `s` to its availability state; any other input is an error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
            // List the accepted values, like the other argument parsers in this file do.
            _ => Err(anyhow::anyhow!(
                "Unknown availability state '{s}', try active,offline"
            )),
        }
    }
 }
 /// Minimal HTTP client for talking to the storage controller API.
 struct Client {
    /// URL of the storage controller. `dispatch` rebuilds request URLs from its
    /// components instead of appending to its path.
    base_url: Url,
    /// Optional JWT; when present it is sent as a `Bearer` authorization header.
    jwt_token: Option<String>,
    /// Underlying HTTP client used for every request.
    client: reqwest::Client,
 }
 impl Client {
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        let url = Url::from_str(&format!(
            "http://{}:{}/{path}",
            self.base_url.host_str().unwrap(),
            self.base_url.port().unwrap()
        ))
        .unwrap();
        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }
        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        let response = response.error_from_body().await?;
        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
 }
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
    let mut trimmed = cli.api.to_string();
    trimmed.pop();
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                let shard_zero = tenant.shards.into_iter().next().unwrap();
                table.add_row([
                    format!("{}", tenant.tenant_id),
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }
            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };
            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };
            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;
            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;
            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                let entry = node_to_shards.entry(s.node_id).or_default();
                entry.push(s.shard_id);
            }
            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    node_to_shards.entry(node.id).or_default();
                }
            }
            let max_shard_per_node = shard_count / node_to_shards.len();
            loop {
                let mut migrate_shard = None;
                for shards in node_to_shards.values_mut() {
                    if shards.len() > max_shard_per_node {
                        // Pick the emptiest
                        migrate_shard = Some(shards.pop().unwrap());
                    }
                }
                let Some(migrate_shard) = migrate_shard else {
                    break;
                };
                // Pick the emptiest node to migrate to
                let mut destinations = node_to_shards
                    .iter()
                    .map(|(k, v)| (k, v.len()))
                    .collect::<Vec<_>>();
                destinations.sort_by_key(|i| i.1);
                let (destination_node, destination_count) = *destinations.first().unwrap();
                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }
                let destination_node = *destination_node;
                node_to_shards
                    .get_mut(&destination_node)
                    .unwrap()
                    .push(migrate_shard);
                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }
            // Spread the shards across the nodes
        }
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| format!("{}", n))
                    .collect::<Vec<_>>()
                    .join(",");
                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }
                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }
                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");
                table.add_row([
                    format!("{}", shard.tenant_shard_id),
                    shard
                        .node_attached
                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
    }
    Ok(())
 }
--- a/diesel.toml
+++ b/diesel.toml
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli
 [print_schema]
-file = "control_plane/attachment_service/src/schema.rs"
+file = "storage_controller/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]
 [migrations_directory]
-dir = "control_plane/attachment_service/migrations"
+dir = "storage_controller/migrations"
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 `storage_controller`:
 Neon storage controller, manages a cluster of pageservers and exposes an API that enables
 managing a many-sharded tenant as a single entity.
 `/control_plane`:
 Local control plane.
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -2,9 +2,9 @@ use std::str::FromStr;
 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
-/// in [`attachment_service::http`]
+/// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TenantId};
 use crate::{
    models::{ShardParameters, TenantConfig},
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
    pub scheduling: Option<NodeSchedulingPolicy>,
 }
 /// Request body for updating a tenant's policies; the CLI sends it with
 /// `PUT control/v1/tenant/{tenant_id}/policy`.
 ///
 /// Both fields are optional.
 // NOTE(review): presumably a `None` field leaves that policy unchanged -- confirm
 // against the server-side handler.
 #[derive(Serialize, Deserialize)]
 pub struct TenantPolicyRequest {
    pub placement: Option<PlacementPolicy>,
    pub scheduling: Option<ShardSchedulingPolicy>,
 }
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -62,12 +68,27 @@ pub struct TenantLocateResponse {
 /// Description of a single tenant: its id, per-shard details, stripe size,
 /// placement policy and tenant configuration.
 ///
 /// The CLI reads one of these from `GET control/v1/tenant/{tenant_id}` and a list
 /// of them from `GET control/v1/tenant`.
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
    pub stripe_size: ShardStripeSize,
    pub policy: PlacementPolicy,
    pub config: TenantConfig,
 }
 /// Description of a single node: its id, availability, scheduling policy and
 /// its HTTP / postgres listen addresses.
 ///
 /// The CLI reads a list of these from `GET control/v1/node`.
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
    pub id: NodeId,
    pub availability: NodeAvailabilityWrapper,
    pub scheduling: NodeSchedulingPolicy,
    pub listen_http_addr: String,
    pub listen_http_port: u16,
    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
 }
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,
@@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard {
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
    pub scheduling_policy: ShardSchedulingPolicy,
 }
 /// Explicitly migrating a particular shard is a low level operation
@@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);
 impl UtilizationScore {
@@ -106,7 +129,7 @@ impl UtilizationScore {
    }
 }
-#[derive(Serialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
@@ -129,7 +152,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
    Active,
    Offline,
@@ -155,22 +178,33 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }
-impl FromStr for NodeAvailability {
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
-    type Err = anyhow::Error;
+pub enum ShardSchedulingPolicy {
    // Normal mode: the tenant's scheduled locations may be updated at will, including
    // for non-essential optimization.
    Active,
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
+    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
-        match s {
+    // For example, this still permits a node's attachment location to change to a secondary in
-            // This is used when parsing node configuration requests from neon-local.
+    // response to a node failure, or to assign a new secondary if a node was removed.
-            // Assume the worst possible utilisation score
+    Essential,
-            // and let it get updated via the heartbeats.
+
-            "active" => Ok(Self::Active(UtilizationScore::worst())),
+    // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
-            "offline" => Ok(Self::Offline),
+    // unavailable, it will not be rescheduled to another node.
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+    Pause,
-        }
+
    // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
    // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
    Stop,
 }
 impl Default for ShardSchedulingPolicy {
    fn default() -> Self {
        Self::Active
    }
 }
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -301,6 +301,7 @@ pub struct TenantConfig {
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -565,6 +565,16 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);
 impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
    /// Build metadata from a fixed-size array of `(key, value)` string pairs,
    /// copying every key and value into an owned `String`.
    fn from(arr: [(&str, &str); N]) -> Self {
        Self(
            arr.iter()
                .map(|&(key, value)| (key.to_string(), value.to_string()))
                .collect::<HashMap<String, String>>(),
        )
    }
 }
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -219,7 +219,6 @@ enum MaybeEnabledStorage {
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -248,7 +247,6 @@ struct S3WithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,6 +182,18 @@ where
        }
    }
    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
    ///
    /// Returns `Ok(())` if the current value is already at least `num`, otherwise
    /// `Err` carrying the current value.
    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
        // Hold the lock only long enough to read the current value.
        let current = {
            let guard = self.internal.lock().unwrap();
            guard.current.cnt_value()
        };
        match current >= num {
            true => Ok(()),
            false => Err(current),
        }
    }
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -27,25 +27,25 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-03-20 on i3en.3xlarge
+//! 2024-04-04 on i3en.3xlarge
 //!
 //! ```text
-//! short/1                 time:   [26.483 µs 26.614 µs 26.767 µs]
+//! short/1                 time:   [25.925 µs 26.060 µs 26.209 µs]
-//! short/2                 time:   [32.223 µs 32.465 µs 32.767 µs]
+//! short/2                 time:   [31.277 µs 31.483 µs 31.722 µs]
-//! short/4                 time:   [47.203 µs 47.583 µs 47.984 µs]
+//! short/4                 time:   [45.496 µs 45.831 µs 46.182 µs]
-//! short/8                 time:   [89.135 µs 89.612 µs 90.139 µs]
+//! short/8                 time:   [84.298 µs 84.920 µs 85.566 µs]
-//! short/16                time:   [190.12 µs 191.52 µs 192.88 µs]
+//! short/16                time:   [185.04 µs 186.41 µs 187.88 µs]
-//! short/32                time:   [380.96 µs 382.63 µs 384.20 µs]
+//! short/32                time:   [385.01 µs 386.77 µs 388.70 µs]
-//! short/64                time:   [736.86 µs 741.07 µs 745.03 µs]
+//! short/64                time:   [770.24 µs 773.04 µs 776.04 µs]
-//! short/128               time:   [1.4106 ms 1.4206 ms 1.4294 ms]
+//! short/128               time:   [1.5017 ms 1.5064 ms 1.5113 ms]
-//! medium/1                time:   [111.81 µs 112.25 µs 112.79 µs]
+//! medium/1                time:   [106.65 µs 107.20 µs 107.85 µs]
-//! medium/2                time:   [158.26 µs 159.13 µs 160.21 µs]
+//! medium/2                time:   [153.28 µs 154.24 µs 155.56 µs]
-//! medium/4                time:   [334.65 µs 337.14 µs 340.07 µs]
+//! medium/4                time:   [325.67 µs 327.01 µs 328.71 µs]
-//! medium/8                time:   [675.32 µs 679.91 µs 685.25 µs]
+//! medium/8                time:   [646.82 µs 650.17 µs 653.91 µs]
-//! medium/16               time:   [1.2929 ms 1.2996 ms 1.3067 ms]
+//! medium/16               time:   [1.2645 ms 1.2701 ms 1.2762 ms]
-//! medium/32               time:   [2.4295 ms 2.4461 ms 2.4623 ms]
+//! medium/32               time:   [2.4409 ms 2.4550 ms 2.4692 ms]
-//! medium/64               time:   [4.3973 ms 4.4458 ms 4.4875 ms]
+//! medium/64               time:   [4.6814 ms 4.7114 ms 4.7408 ms]
-//! medium/128              time:   [7.5955 ms 7.7847 ms 7.9481 ms]
+//! medium/128              time:   [8.7790 ms 8.9037 ms 9.0282 ms]
 //! ```
 use bytes::{Buf, Bytes};
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -128,12 +128,12 @@ impl Client {
    pub async fn timeline_info(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        force_await_logical_size: ForceAwaitLogicalSize,
    ) -> Result<pageserver_api::models::TimelineInfo> {
        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );
@@ -151,11 +151,11 @@ impl Client {
    pub async fn keyspace(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<pageserver_api::models::partitioning::Partitioning> {
        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
            self.mgmt_api_endpoint
        );
        self.get(&uri)
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
    fanout: u64,
    ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 2);
+    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
    let exp_base = fanout.max(2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
            break;
        }
        current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
    }
    Ok(())
 }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,4 +1,5 @@
 use anyhow::Context;
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;
@@ -95,7 +96,7 @@ async fn main_impl(
            let timeline = *timeline;
            let info = mgmt_api_client
                .timeline_info(
-                    timeline.tenant_id,
+                    TenantShardId::unsharded(timeline.tenant_id),
                    timeline.timeline_id,
                    ForceAwaitLogicalSize::No,
                )
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -173,7 +174,10 @@ async fn main_impl(
                let timeline = *timeline;
                async move {
                    let partitioning = mgmt_api_client
-                        .keyspace(timeline.tenant_id, timeline.timeline_id)
+                        .keyspace(
                            TenantShardId::unsharded(timeline.tenant_id),
                            timeline.timeline_id,
                        )
                        .await?;
                    let lsn = partitioning.at_lsn;
                    let start = Instant::now();
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 use humantime::Duration;
 use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
            let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(
                    TenantShardId::unsharded(tl.tenant_id),
                    tl.timeline_id,
                    ForceAwaitLogicalSize::Yes,
                )
                .await
                .unwrap();
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                        .timeline_info(
                            TenantShardId::unsharded(tl.tenant_id),
                            tl.timeline_id,
                            ForceAwaitLogicalSize::Yes,
                        )
                        .await
                        .unwrap();
                }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tracing::*;
@@ -671,42 +672,37 @@ fn start_pageserver(
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
    // All started up! Now just sit and wait for shutdown signal.
    {
        use signal_hook::consts::*;
        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
            let mut signals =
                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
            return signals
                .forever()
                .next()
                .expect("forever() never returns None unless explicitly closed");
        });
        let signal = BACKGROUND_RUNTIME
            .block_on(signal_handler)
            .expect("join error");
        match signal {
            SIGQUIT => {
                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
                std::process::exit(111);
            }
            SIGINT | SIGTERM => {
                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
-                // This cancels the `shutdown_pageserver` cancellation tree.
+    {
-                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+        BACKGROUND_RUNTIME.block_on(async move {
-                // The plan is to change that over time.
+            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
-                shutdown_pageserver.take();
+            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
-                let bg_remote_storage = remote_storage.clone();
+            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-                let bg_deletion_queue = deletion_queue.clone();
+            let signal = tokio::select! {
-                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                _ = sigquit.recv() => {
-                    &tenant_manager,
+                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
-                    bg_remote_storage.map(|_| bg_deletion_queue),
+                    std::process::exit(111);
-                    0,
+                }
-                ));
+                _ = sigint.recv() => { "SIGINT" },
-                unreachable!()
+                _ = sigterm.recv() => { "SIGTERM" },
-            }
+            };
-            _ => unreachable!(),
+
-        }
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
            // This cancels the `shutdown_pageserver` cancellation tree.
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
            let bg_remote_storage = remote_storage.clone();
            let bg_deletion_queue = deletion_queue.clone();
            pageserver::shutdown_pageserver(
                &tenant_manager,
                bg_remote_storage.map(|_| bg_deletion_queue),
                0,
            )
            .await;
            unreachable!()
        })
    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -12,7 +12,7 @@ use pageserver_api::{
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, generation::Generation, id::NodeId};
+use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 use crate::{
    config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };
-        fail::fail_point!("control-plane-client-validate");
+        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
        if self.cancel.is_cancelled() {
            return Err(RetryForeverError::ShuttingDown);
        }
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1629,7 +1629,7 @@ components:
          type: integer
          format: int64
          minimum: 0
-          description: The amount of disk space currently utilized by layer files.
+          description: The amount of disk space currently used.
        free_space_bytes:
          type: integer
          format: int64
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -993,11 +993,26 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);
    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
    let activate = true;
    #[cfg(feature = "testing")]
    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
    let tenant_info = async {
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        if activate {
            // This is advisory: we prefer to let the tenant activate on-demand when this function is
            // called, but it is still valid to return 200 and describe the current state of the tenant
            // if it doesn't make it into an active state.
            tenant
                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
                .await
                .ok();
        }
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
@@ -170,7 +171,10 @@ async fn import_rel(
        let r = reader.read_exact(&mut buf).await;
        match r {
            Ok(_) => {
-                modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                let key = rel_block_to_key(rel, blknum);
                if modification.tline.get_shard_identity().is_key_local(&key) {
                    modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
                }
            }
            // TODO: UnexpectedEof is expected
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });
 pub(crate) struct WalIngestMetrics {
    pub(crate) bytes_received: IntCounter,
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
 }
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
    bytes_received: register_int_counter!(
        "pageserver_wal_ingest_bytes_received",
        "Bytes of WAL ingested from safekeepers",
    )
    .unwrap(),
    records_received: register_int_counter!(
        "pageserver_wal_ingest_records_received",
        "Number of WAL records received from safekeepers"
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -876,7 +876,13 @@ impl PageServerHandler {
            if lsn <= last_record_lsn {
                lsn = last_record_lsn;
            } else {
-                timeline.wait_lsn(lsn, ctx).await?;
+                timeline
                    .wait_lsn(
                        lsn,
                        crate::tenant::timeline::WaitLsnWaiter::PageService,
                        ctx,
                    )
                    .await?;
                // Since we waited for 'lsn' to arrive, that is now the last
                // record LSN. (Or close enough for our purposes; the
                // last-record LSN can advance immediately after we return
@@ -888,7 +894,13 @@ impl PageServerHandler {
                    "invalid LSN(0) in request".into(),
                ));
            }
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
                .wait_lsn(
                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    ctx,
                )
                .await?;
        }
        if lsn < **latest_gc_cutoff_lsn {
@@ -1215,7 +1227,13 @@ impl PageServerHandler {
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
                .wait_lsn(
                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    ctx,
                )
                .await?;
            timeline
                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -214,13 +214,12 @@ pub enum TaskKind {
    /// Internally, `Client` hands over requests to the `Connection` object.
    /// The `Connection` object is responsible for speaking the wire protocol.
    ///
-    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
+    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// That abstraction doesn't use `task_mgr`.
    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
-    /// Once the connection is established, the `TaskHandle` task creates a
+    /// Once the connection is established, the `TaskHandle` task spawns a
-    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will be cancelled soon after the `TaskHandle` is dropped.
@@ -230,7 +229,6 @@ pub enum TaskKind {
    WalReceiverManager,
    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
    ///
    /// [`WalReceiverManager`]: Self::WalReceiverManager
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,6 +12,7 @@
 //!
 use anyhow::{bail, Context};
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::{Mutex, RwLock};
+use std::sync::Mutex;
 use std::time::{Duration, Instant};
 use crate::span;
@@ -260,7 +261,7 @@ pub struct Tenant {
    // We keep TenantConfOpt struct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
    tenant_shard_id: TenantShardId,
@@ -1515,7 +1516,7 @@ impl Tenant {
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
+                        .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                        .await
                        .map_err(|e| match e {
                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1606,7 +1607,7 @@ impl Tenant {
        );
        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);
@@ -1633,7 +1634,7 @@ impl Tenant {
        }
        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(());
@@ -1782,7 +1783,7 @@ impl Tenant {
    async fn shutdown(
        &self,
        shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
+        shutdown_mode: timeline::ShutdownMode,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();
@@ -1829,16 +1830,8 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let timeline_id = timeline.timeline_id;
-
+                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
-                let span =
+                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
                js.spawn(async move {
                    if freeze_and_flush {
                        timeline.flush_and_shutdown().instrument(span).await
                    } else {
                        timeline.shutdown().instrument(span).await
                    }
                });
            })
        };
        // test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2082,14 +2075,14 @@ impl Tenant {
    }
    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf.load().location.attach_mode
    }
    /// For API access: generate a LocationConfig equivalent to the one that would be used to
    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
    /// rare external API calls, like a reconciliation at startup.
    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
+        let conf = self.tenant_conf.load();
        let location_config_mode = match conf.location.attach_mode {
            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2236,7 +2229,7 @@ where
 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }
    pub fn effective_config(&self) -> TenantConf {
@@ -2245,84 +2238,84 @@ impl Tenant {
    }
    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }
    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }
    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }
    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }
    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }
    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }
    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }
    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }
    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }
    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }
    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }
    pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        let heatmap_period = tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2334,26 +2327,40 @@ impl Tenant {
    }
    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
+        // Use read-copy-update in order to avoid overwriting the location config
-        self.tenant_conf_updated();
+        // state if this races with [`Tenant::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.
        self.tenant_conf.rcu(|inner| {
            Arc::new(AttachedTenantConf {
                tenant_conf: new_tenant_conf.clone(),
                location: inner.location,
            })
        });
        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }
    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
+        let new_tenant_conf = new_conf.tenant_conf.clone();
-        self.tenant_conf_updated();
+
        self.tenant_conf.store(Arc::new(new_conf));
        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }
@@ -2367,11 +2374,8 @@ impl Tenant {
            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
    }
-    pub(crate) fn tenant_conf_updated(&self) {
+    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
-        let conf = {
+        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
            let guard = self.tenant_conf.read().unwrap();
            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
        };
        self.timeline_get_throttle.reconfigure(conf)
    }
@@ -2519,7 +2523,7 @@ impl Tenant {
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
        }
    }
@@ -3505,7 +3509,7 @@ impl Tenant {
    }
    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }
 }
@@ -3653,6 +3657,9 @@ pub(crate) mod harness {
                heatmap_period: Some(tenant_conf.heatmap_period),
                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
            }
        }
    }
@@ -3851,6 +3858,7 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
    use tests::timeline::ShutdownMode;
    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4296,7 +4304,7 @@ mod tests {
            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
@@ -4337,7 +4345,7 @@ mod tests {
            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
@@ -5118,7 +5126,7 @@ mod tests {
            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown()
+                .shutdown(super::timeline::ShutdownMode::Hard)
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -57,6 +57,9 @@ pub mod defaults {
    // throughputs up to 1GiB/s per timeline.
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
    // By default, ingest enough WAL for two new L0 layers before checking whether new
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -362,6 +365,10 @@ pub struct TenantConf {
    pub lazy_slru_download: bool,
    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
    // How much WAL must be ingested before checking again whether a new image layer is required.
    // Expressed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,
 }
 /// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,
 }
 impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
                .timeline_get_throttle
                .clone()
                .unwrap_or(global_conf.timeline_get_throttle),
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
        }
    }
 }
@@ -548,6 +561,7 @@ impl Default for TenantConf {
            heatmap_period: Duration::ZERO,
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
        }
    }
 }
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -14,7 +14,10 @@ use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
-    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
        timeline::ShutdownMode,
    },
 };
 use super::{
@@ -463,7 +466,7 @@ impl DeleteTenantFlow {
        // tenant.shutdown
        // It's also bad that we're holding tenants.read here.
        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
            return Err(DeleteTenantError::Other(anyhow::anyhow!(
                "tenant shutdown is already in progress"
            )));
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,6 +72,10 @@ impl EphemeralFile {
        self.len
    }
    pub(crate) fn id(&self) -> page_cache::FileId {
        self.page_cache_file_id
    }
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -346,35 +346,6 @@ where
    }
 }
 #[derive(PartialEq, Eq, Hash, Debug, Clone)]
 pub enum InMemoryLayerHandle {
    Open {
        lsn_floor: Lsn,
        end_lsn: Lsn,
    },
    Frozen {
        idx: usize,
        lsn_floor: Lsn,
        end_lsn: Lsn,
    },
 }
 impl InMemoryLayerHandle {
    pub fn get_lsn_floor(&self) -> Lsn {
        match self {
            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
        }
    }
    pub fn get_end_lsn(&self) -> Lsn {
        match self {
            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
        }
    }
 }
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
        self.historic.iter()
    }
-    /// Get a handle for the first in memory layer that matches the provided predicate.
+    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
-    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
    ///
    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
    /// the same exclusive region established by holding the layer manager lock.
    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
    where
        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
    {
        if let Some(open) = &self.open_layer {
            if pred(open) {
-                return Some(InMemoryLayerHandle::Open {
+                return Some(open.clone());
                    lsn_floor: open.get_lsn_range().start,
                    end_lsn: open.get_lsn_range().end,
                });
            }
        }
-        let pos = self.frozen_layers.iter().rev().position(pred);
+        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
        pos.map(|rev_idx| {
            let idx = self.frozen_layers.len() - 1 - rev_idx;
            InMemoryLayerHandle::Frozen {
                idx,
                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
            }
        })
    }
    /// Get the layer pointed to by the provided handle.
    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
        match handle {
            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
        }
    }
    ///
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,6 +44,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                            shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                            join_set.spawn(
                                async move {
                                    let freeze_and_flush = true;
                                    let res = {
                                        let (_guard, shutdown_progress) = completion::channel();
-                                        t.shutdown(shutdown_progress, freeze_and_flush).await
+                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                                    };
                                    if let Err(other_progress) = res {
@@ -1107,7 +1106,7 @@ impl TenantManager {
                };
                info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, false).await {
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
                    Ok(()) => {}
                    Err(barrier) => {
                        info!("Shutdown already in progress, waiting for it to complete");
@@ -1223,7 +1222,7 @@ impl TenantManager {
                    TenantSlot::Attached(tenant) => {
                        let (_guard, progress) = utils::completion::channel();
                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                        match tenant.shutdown(progress, false).await {
+                        match tenant.shutdown(progress, ShutdownMode::Hard).await {
                            Ok(()) => {
                                info!("Finished shutting down just-spawned tenant");
                            }
@@ -1273,7 +1272,7 @@ impl TenantManager {
        };
        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
            Ok(()) => {
                slot_guard.drop_old_value()?;
            }
@@ -1649,7 +1648,14 @@ impl TenantManager {
                    fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                        "failpoint"
                    )));
-                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+                    if let Err(e) = timeline
                        .wait_lsn(
                            *target_lsn,
                            crate::tenant::timeline::WaitLsnWaiter::Tenant,
                            ctx,
                        )
                        .await
                    {
                        // Failure here might mean shutdown, in any case this part is an optimization
                        // and we shouldn't hold up the split operation.
                        tracing::warn!(
@@ -1670,7 +1676,7 @@ impl TenantManager {
        // Phase 5: Shut down the parent shard, and erase it from disk
        let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
+        match parent.shutdown(progress, ShutdownMode::Hard).await {
            Ok(()) => {}
            Err(other) => {
                other.wait().await;
@@ -2657,11 +2663,11 @@ where
    let attached_tenant = match slot_guard.get_old_value() {
        Some(TenantSlot::Attached(tenant)) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+            let shutdown_mode = ShutdownMode::Hard;
            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, shutdown_mode).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -200,6 +200,7 @@ use utils::backoff::{
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
@@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
-use crate::deletion_queue::DeletionQueueClient;
+use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 /// Doing non-essential flushes of deletion queue is subject to this timeout, after
 /// which we warn and skip.
 const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -588,14 +593,14 @@ impl RemoteTimelineClient {
        upload_queue: &mut UploadQueueInitialized,
        metadata: TimelineMetadata,
    ) {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
        info!(
-            "scheduling metadata upload with {} files ({} changed)",
+            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
            upload_queue.latest_files.len(),
            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
        );
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
@@ -1050,6 +1055,26 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Flush the deletion queue, waiting at most `DELETION_QUEUE_FLUSH_TIMEOUT`.
    ///
    /// If `flush_immediate` completes in time, its result is returned unchanged.
    /// If the timeout elapses first, a warning is logged and `Ok(())` is returned.
    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
        let flush = self.deletion_queue_client.flush_immediate();
        let Ok(flush_result) = tokio::time::timeout(DELETION_QUEUE_FLUSH_TIMEOUT, flush).await
        else {
            // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
            // to ensure that _usually_ objects are really gone after a DELETE is acked.  However, in case of deletion
            // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
            tracing::warn!(
                "Timed out waiting for deletion queue flush, acking deletion anyway"
            );
            return Ok(());
        };
        flush_result
    }
    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
    /// deletes leaked files if any, and proceeds with deletion of the index file at the end.
@@ -1099,7 +1124,7 @@ impl RemoteTimelineClient {
        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
        let cancel = shutdown_token();
@@ -1173,7 +1198,7 @@ impl RemoteTimelineClient {
        // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -1569,7 +1594,7 @@ impl RemoteTimelineClient {
    /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
    ///
    /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
    pub(crate) fn stop(&self) {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
            // Existing on-disk layers: just update their access time.
            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                tracing::debug!("Layer {} is already on disk", layer.name);
                if cfg!(debug_assertions) {
                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
                    // are already present on disk are really there.
                    let local_path = self
                        .conf
                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
                        .join(layer.name.file_name());
                    match tokio::fs::metadata(&local_path).await {
                        Ok(meta) => {
                            tracing::debug!(
                                "Layer {} present at {}, size {}",
                                layer.name,
                                local_path,
                                meta.len(),
                            );
                        }
                        Err(e) => {
                            tracing::warn!(
                                "Layer {} not found at {} ({})",
                                layer.name,
                                local_path,
                                e
                            );
                            debug_assert!(false);
                        }
                    }
                }
                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                    || on_disk.access_time != layer.access_time
                {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
-use super::layer_map::InMemoryLayerHandle;
+use self::inmemory_layer::InMemoryLayerFileId;
-use super::timeline::layer_manager::LayerManager;
+
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
    }
 }
-/// Description of layer to be read - the layer map can turn
+/// A key that uniquely identifies a layer in a timeline
-/// this description into the actual layer.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub(crate) enum LayerId {
-pub(crate) enum ReadableLayerDesc {
+    PersitentLayerId(PersistentLayerKey),
-    Persistent {
+    InMemoryLayerId(InMemoryLayerFileId),
        desc: PersistentLayerDesc,
        lsn_range: Range<Lsn>,
    },
    InMemory {
        handle: InMemoryLayerHandle,
        lsn_ceil: Lsn,
    },
 }
-/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+/// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
 #[derive(Debug)]
-struct ReadableLayerDescOrdered(ReadableLayerDesc);
+pub(crate) enum ReadableLayer {
    PersistentLayer(Layer),
    InMemoryLayer(Arc<InMemoryLayer>),
 }
 /// A partial description of a read to be done.
 #[derive(Debug, Clone)]
 struct ReadDesc {
    /// An id used to resolve the readable layer within the fringe
    layer_id: LayerId,
    /// Lsn range for the read, used for selecting the next read
    lsn_range: Range<Lsn>,
 }
 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<ReadableLayerDesc, KeySpace>,
+    layers: HashMap<LayerId, LayerKeyspace>,
 }
 #[derive(Debug)]
 struct LayerKeyspace {
    layer: ReadableLayer,
    target_keyspace: KeySpace,
 }
 impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
-            layers_by_lsn: BinaryHeap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
            layers: HashMap::new(),
        }
    }
-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let handle = match self.layers_by_lsn.pop() {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
-            Some(h) => h,
+            Some(desc) => desc,
            None => return None,
        };
-        let removed = self.layers.remove_entry(&handle.0);
+        let removed = self.layers.remove_entry(&read_desc.layer_id);
        match removed {
-            Some((layer, keyspace)) => Some((layer, keyspace)),
+            Some((
                _,
                LayerKeyspace {
                    layer,
                    target_keyspace,
                },
            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
            None => unreachable!("fringe internals are always consistent"),
        }
    }
-    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
+    pub(crate) fn update(
-        let entry = self.layers.entry(layer.clone());
+        &mut self,
        layer: ReadableLayer,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
    ) {
        let layer_id = layer.id();
        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().merge(&keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
            }
            Entry::Vacant(entry) => {
-                self.layers_by_lsn
+                self.planned_reads_by_lsn.push(ReadDesc {
-                    .push(ReadableLayerDescOrdered(entry.key().clone()));
+                    lsn_range,
-                entry.insert(keyspace);
+                    layer_id: layer_id.clone(),
                });
                entry.insert(LayerKeyspace {
                    layer,
                    target_keyspace: keyspace,
                });
            }
        }
    }
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
    }
 }
-impl Ord for ReadableLayerDescOrdered {
+impl Ord for ReadDesc {
    fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
        if ord == std::cmp::Ordering::Equal {
-            self.0
+            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
                .get_lsn_floor()
                .cmp(&other.0.get_lsn_floor())
                .reverse()
        } else {
            ord
        }
    }
 }
-impl PartialOrd for ReadableLayerDescOrdered {
+impl PartialOrd for ReadDesc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }
-impl PartialEq for ReadableLayerDescOrdered {
+impl PartialEq for ReadDesc {
    fn eq(&self, other: &Self) -> bool {
-        self.0.get_lsn_floor() == other.0.get_lsn_floor()
+        self.lsn_range == other.lsn_range
            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
    }
 }
-impl Eq for ReadableLayerDescOrdered {}
+impl Eq for ReadDesc {}
-impl ReadableLayerDesc {
+impl ReadableLayer {
-    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+    pub(crate) fn id(&self) -> LayerId {
        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
+            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
-            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
+            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
        }
    }
    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
        match self {
            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
        }
    }
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        layer_manager: &LayerManager,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        match self {
-            ReadableLayerDesc::Persistent { desc, lsn_range } => {
+            ReadableLayer::PersistentLayer(layer) => {
                let layer = layer_manager.get_from_desc(desc);
                layer
-                    .get_values_reconstruct_data(
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                        keyspace,
                        lsn_range.clone(),
                        reconstruct_state,
                        ctx,
                    )
                    .await
            }
-            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
+            ReadableLayer::InMemoryLayer(layer) => {
                let layer = layer_manager
                    .layer_map()
                    .get_in_memory_layer(handle)
                    .unwrap();
                layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -946,6 +947,34 @@ impl DeltaLayerInner {
        Ok(planner.finish())
    }
    /// Compute the buffer capacity needed to serve every read in `planned_reads`.
    ///
    /// Returns the size of the largest planned read, or `read_size_soft_max` when
    /// there are no planned reads. If the largest read exceeds `read_size_soft_max`,
    /// a warning listing the `key@lsn` of each blob in that read is logged.
    fn get_min_read_buffer_size(
        planned_reads: &[VectoredRead],
        read_size_soft_max: usize,
    ) -> usize {
        let biggest = match planned_reads.iter().max_by_key(|read| read.size()) {
            Some(read) => read,
            None => return read_size_soft_max,
        };
        let required = biggest.size();
        if required <= read_size_soft_max {
            return required;
        }
        // If the read is oversized, it should only contain one key.
        let offenders = biggest
            .blobs_at
            .as_slice()
            .iter()
            .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
            .join(", ");
        tracing::warn!(
            "Oversized vectored read ({} > {}) for keys {}",
            required,
            read_size_soft_max,
            offenders
        );
        required
    }
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -959,7 +988,8 @@ impl DeltaLayerInner {
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
-        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
        let mut buf = Some(BytesMut::with_capacity(buf_size));
        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -986,7 +1016,7 @@ impl DeltaLayerInner {
                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+                    buf = Some(BytesMut::with_capacity(buf_size));
                    continue;
                }
@@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
 mod test {
    use std::collections::BTreeMap;
    use itertools::MinMaxResult;
    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
    use rand::RngCore;
    use super::*;
    use crate::{
-        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+        context::DownloadBehavior,
        task_mgr::TaskKind,
        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
        DEFAULT_PG_VERSION,
    };
    /// Construct an index for a fictional delta layer and then
@@ -1332,4 +1369,229 @@ mod test {
        assert_eq!(planned_blobs, expected_blobs);
    }
    /// Tunables for the randomized end-to-end vectored read test below.
    mod constants {
        use utils::lsn::Lsn;
        /// Offset used by all lsns in this test
        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
        /// Number of unique keys included in the test data
        pub(super) const KEY_COUNT: u8 = 60;
        /// Exclusive upper bound on the number of different lsns for each key:
        /// the per-key count is drawn from `1..MAX_ENTRIES_PER_KEY`.
        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
        /// Possible value sizes (in bytes) for each key along with a probability weight
        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
        /// Weighted choice of whether there will be a gap between the current key
        /// and the next one (weights 1:2, i.e. a gap 33.3% of the time)
        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
        /// The minimum size of a key range in the generated reads. A range whose
        /// minimum extent would reach the end of the key range is clamped to that
        /// end instead, so it may be shorter.
        pub(super) const MIN_RANGE_SIZE: i128 = 10;
        /// The number of ranges included in each vectored read
        pub(super) const RANGES_COUNT: u8 = 2;
        /// The number of vectored reads performed
        pub(super) const READS_COUNT: u8 = 100;
        /// Soft max size of a vectored read. Will be violated if we have to read keys
        /// with values larger than the limit
        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
    }
    /// One generated test record: a value stored for `key` at `lsn`.
    struct Entry {
        key: Key,
        lsn: Lsn,
        // Random payload bytes written to the delta layer for this (key, lsn).
        value: Vec<u8>,
    }
    /// Generate the random test data set for `constants::KEY_COUNT` keys.
    ///
    /// Entries are pushed key by key, and within a key in increasing LSN order.
    /// The result is fully determined by the state of `rng`, so the order of the
    /// draws below must not be changed without expecting different test data.
    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
        let mut current_key = Key::MIN;
        let mut entries = Vec::new();
        for _ in 0..constants::KEY_COUNT {
            // Number of versions for this key; the upper bound is exclusive.
            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
            // Candidate LSNs: LSN_OFFSET + 8, then +8 for every further step.
            let mut lsns_iter =
                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
                    Some(Lsn(lsn.0 + 0x08))
                });
            let mut lsns = Vec::new();
            // Keep each candidate LSN with probability 0.5 until `count` are collected,
            // so different keys end up with different LSN sets.
            while lsns.len() < count as usize {
                let take = rng.gen_bool(0.5);
                let lsn = lsns_iter.next().unwrap();
                if take {
                    lsns.push(lsn);
                }
            }
            for lsn in lsns {
                // Pick a value size according to the weights in VALUE_SIZES ...
                let size = constants::VALUE_SIZES
                    .choose_weighted(rng, |item| item.1)
                    .unwrap()
                    .0;
                // ... and fill a buffer of that size with random bytes.
                let mut buf = vec![0; size];
                rng.fill_bytes(&mut buf);
                entries.push(Entry {
                    key: current_key,
                    lsn,
                    value: buf,
                })
            }
            // Decide whether to leave a gap before the next key.
            let gap = constants::KEY_GAP_CHANGES
                .choose_weighted(rng, |item| item.1)
                .unwrap()
                .0;
            if gap {
                // Advance by 2 instead of 1, leaving a gap between keys.
                current_key = current_key.add(2);
            } else {
                current_key = current_key.add(1);
            }
        }
        entries
    }
    /// Summary of a generated data set, as built by `get_entries_meta`.
    struct EntriesMeta {
        // Half-open range from the smallest key to one past the largest key.
        key_range: Range<Key>,
        // Half-open range from the smallest LSN to the largest LSN + 1.
        lsn_range: Range<Lsn>,
        // Expected value for every (key, lsn) pair, used to verify reads.
        index: BTreeMap<(Key, Lsn), Vec<u8>>,
    }
    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
            _ => panic!("More than one entry is always expected"),
        };
        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
            _ => panic!("More than one entry is always expected"),
        };
        let mut index = BTreeMap::new();
        for entry in entries.iter() {
            index.insert((entry.key, entry.lsn), entry.value.clone());
        }
        EntriesMeta {
            key_range,
            lsn_range,
            index,
        }
    }
    /// Build a random `KeySpace` of `constants::RANGES_COUNT` ranges inside `key_range`.
    ///
    /// Each range is re-drawn until `keyspace.overlaps` reports no overlap with the
    /// ranges already chosen. The result depends on the state of `rng`, so the order
    /// of the draws below must stay as is to keep the test data reproducible.
    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
        let start = key_range.start.to_i128();
        let end = key_range.end.to_i128();
        let mut keyspace = KeySpace::default();
        for _ in 0..constants::RANGES_COUNT {
            let mut range: Option<Range<Key>> = Option::default();
            // Draw candidates until we have one that does not overlap earlier picks.
            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
                let range_start = rng.gen_range(start..end);
                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
                if range_end_offset >= end {
                    // Not enough room for a minimum-sized range: clamp to the end of
                    // the key range (this range may be shorter than MIN_RANGE_SIZE).
                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
                } else {
                    // Otherwise pick an end at least MIN_RANGE_SIZE past the start.
                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
                }
            }
            // NOTE(review): ranges are pushed in the order drawn, not sorted by key —
            // confirm that consumers of this KeySpace do not require ordered ranges.
            keyspace.ranges.push(range.unwrap());
        }
        keyspace
    }
    /// End-to-end check of vectored reads against a real delta layer file:
    /// writes randomly generated entries (seeded, hence reproducible) into a delta
    /// layer, then performs `constants::READS_COUNT` random vectored reads and
    /// asserts that every blob returned matches the value that was written for
    /// its (key, lsn).
    #[tokio::test]
    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
        // NOTE(review): the harness name differs from the test function name —
        // confirm whether that is intentional.
        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
        let (tenant, ctx) = harness.load().await;
        let timeline_id = TimelineId::generate();
        let timeline = tenant
            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
            .await?;
        tracing::info!("Generating test data ...");
        // Fixed seed: the generated data and the chosen keyspaces are deterministic.
        let rng = &mut StdRng::seed_from_u64(0);
        let entries = generate_entries(rng);
        let entries_meta = get_entries_meta(&entries);
        tracing::info!("Done generating {} entries", entries.len());
        tracing::info!("Writing test data to delta layer ...");
        let mut writer = DeltaLayerWriter::new(
            harness.conf,
            timeline_id,
            harness.tenant_shard_id,
            entries_meta.key_range.start,
            entries_meta.lsn_range.clone(),
        )
        .await?;
        // Entries are written in the order they were generated.
        for entry in entries {
            let (_, res) = writer
                .put_value_bytes(entry.key, entry.lsn, entry.value, false)
                .await;
            res?;
        }
        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
        let inner = resident.get_inner_delta(&ctx).await?;
        let file_size = inner.file.metadata().await?.len();
        tracing::info!(
            "Done writing test data to delta layer. Resulting file size is: {}",
            file_size
        );
        for i in 0..constants::READS_COUNT {
            tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
            // Fresh index reader and planner for every read.
            let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
            let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
                block_reader,
            );
            let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
            let mut reconstruct_state = ValuesReconstructState::new();
            let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
            // Byte offset at which the index starts, passed to the planner as the
            // end of the data section.
            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
            let vectored_reads = DeltaLayerInner::plan_reads(
                keyspace.clone(),
                entries_meta.lsn_range.clone(),
                data_end_offset,
                index_reader,
                planner,
                &mut reconstruct_state,
                &ctx,
            )
            .await?;
            let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
            // Size the buffer for the largest planned read, which may exceed the
            // soft maximum.
            let buf_size = DeltaLayerInner::get_min_read_buffer_size(
                &vectored_reads,
                constants::MAX_VECTORED_READ_BYTES,
            );
            let mut buf = Some(BytesMut::with_capacity(buf_size));
            for read in vectored_reads {
                // The buffer is handed to the reader and taken back from its result
                // so that it is reused across reads.
                let blobs_buf = vectored_blob_reader
                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
                    .await?;
                // Every blob read back must equal the bytes written for its (key, lsn).
                for meta in blobs_buf.blobs.iter() {
                    let value = &blobs_buf.buf[meta.start..meta.end];
                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
                }
                buf = Some(blobs_buf.buf);
            }
        }
        Ok(())
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -540,7 +541,25 @@ impl ImageLayerInner {
        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
-            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
+            let buf_size = read.size();
            if buf_size > max_vectored_read_bytes {
                // If the read is oversized, it should only contain one key.
                let offenders = read
                    .blobs_at
                    .as_slice()
                    .iter()
                    .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                    .join(", ");
                tracing::warn!(
                    "Oversized vectored read ({} > {}) for keys {}",
                    buf_size,
                    max_vectored_read_bytes,
                    offenders
                );
            }
            let buf = BytesMut::with_capacity(buf_size);
            let res = vectored_blob_reader.read_blobs(&read, buf).await;
            match res {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::walrecord;
+use crate::{page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
@@ -36,10 +36,14 @@ use super::{
    ValuesReconstructState,
 };
    /// Identifier of an in-memory layer: a newtype around the `page_cache::FileId`
    /// of the ephemeral file that backs the layer.
    #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
    pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    file_id: InMemoryLayerFileId,
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
 };
 impl InMemoryLayer {
    /// Returns a copy of this layer's [`InMemoryLayerFileId`].
    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
        self.file_id
    }
    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }
@@ -443,8 +451,10 @@ impl InMemoryLayer {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
        let key = InMemoryLayerFileId(file.id());
        Ok(InMemoryLayer {
            file_id: key,
            conf,
            timeline_id,
            tenant_shard_id,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1759,6 +1759,18 @@ impl ResidentLayer {
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.owner.metadata()
    }
    /// Test-only accessor for the inner delta layer of this resident layer.
    ///
    /// Returns a reference to the [`delta_layer::DeltaLayerInner`] when the downloaded
    /// layer is a delta layer.
    ///
    /// # Errors
    ///
    /// - Propagates any error from `self.downloaded.get(..)`.
    /// - Returns an error ("Expected a delta layer") when the layer is an image layer.
    #[cfg(test)]
    pub(crate) async fn get_inner_delta<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
        let owner = &self.owner.0;
        match self.downloaded.get(owner, ctx).await? {
            LayerKind::Delta(d) => Ok(d),
            LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
        }
    }
 }
 impl AsLayerDesc for ResidentLayer {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -9,6 +9,7 @@ pub mod uninit;
 mod walreceiver;
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
-use super::remote_timeline_client::RemoteTimelineClient;
+use super::config::TenantConf;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {
 pub struct Timeline {
    conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
    myself: Weak<Self>,
@@ -281,10 +282,12 @@ pub struct Timeline {
    pub(super) flush_loop_state: Mutex<FlushLoopState>,
    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
-    /// The value is a counter, incremented every time a new flush cycle is requested.
+    /// - The u64 value is a counter, incremented every time a new flush cycle is requested.
-    /// The flush cycle counter is sent back on the layer_flush_done channel when
+    ///   The flush cycle counter is sent back on the layer_flush_done channel when
-    /// the flush finishes. You can use that to wait for the flush to finish.
+    ///   the flush finishes. You can use that to wait for the flush to finish.
-    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
+    /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
    ///   read by whoever sends an update
    layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
@@ -309,6 +312,8 @@ pub struct Timeline {
    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
    last_image_layer_creation_check_at: AtomicLsn,
    /// Current logical size of the "datadir", at the last LSN.
    current_logical_size: LogicalSize,
@@ -610,6 +615,25 @@ pub enum GetVectoredImpl {
    Vectored,
 }
 /// Identifies who is calling [`Timeline::wait_lsn`].
 ///
 /// In debug builds, `wait_lsn` inspects this value when invoked from a walreceiver
 /// task kind, to detect a timeline's own WAL ingest waiting on itself (a deadlock).
 pub(crate) enum WaitLsnWaiter<'a> {
    /// The waiter is a timeline; carries a reference to the waiting timeline so it can
    /// be compared against the timeline being waited on.
    Timeline(&'a Timeline),
    /// The waiter is tenant-level code. Not expected to run with a walreceiver task kind.
    Tenant,
    /// The waiter is the page service. Not expected to run with a walreceiver task kind.
    PageService,
 }
 /// Argument to [`Timeline::shutdown`].
 #[derive(Debug, Clone, Copy)]
 pub(crate) enum ShutdownMode {
    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
    ///
    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
    /// the call to [`Timeline::shutdown`].
    FreezeAndFlush,
    /// Shut down immediately, without waiting for any open layers to flush.
    Hard,
 }
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -1058,7 +1082,8 @@ impl Timeline {
    pub(crate) async fn wait_lsn(
        &self,
        lsn: Lsn,
-        _ctx: &RequestContext, /* Prepare for use by cancellation */
+        who_is_waiting: WaitLsnWaiter<'_>,
        ctx: &RequestContext, /* Prepare for use by cancellation */
    ) -> Result<(), WaitLsnError> {
        if self.cancel.is_cancelled() {
            return Err(WaitLsnError::Shutdown);
@@ -1066,20 +1091,28 @@ impl Timeline {
            return Err(WaitLsnError::BadState);
        }
-        // This should never be called from the WAL receiver, because that could lead
+        if cfg!(debug_assertions) {
-        // to a deadlock.
+            match ctx.task_kind() {
-        debug_assert!(
+                TaskKind::WalReceiverManager
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
+                | TaskKind::WalReceiverConnectionHandler
-            "wait_lsn cannot be called in WAL receiver"
+                | TaskKind::WalReceiverConnectionPoller => {
-        );
+                    let is_myself = match who_is_waiting {
-        debug_assert!(
+                        WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
+                        WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
-            "wait_lsn cannot be called in WAL receiver"
+                    };
-        );
+                    if is_myself {
-        debug_assert!(
+                        if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
+                            // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
-            "wait_lsn cannot be called in WAL receiver"
+                            panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
-        );
+                        }
                    } else {
                        // if another timeline is waiting for us, there's no deadlock risk because
                        // our walreceiver task can make progress independent of theirs
                    }
                }
                _ => {}
            }
        }
        let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
@@ -1138,8 +1171,8 @@ impl Timeline {
    /// Flush to disk all data that was written with the put_* functions
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
-        self.freeze_inmem_layer(false).await;
+        let to_lsn = self.freeze_inmem_layer(false).await;
-        self.flush_frozen_layers_and_wait().await
+        self.flush_frozen_layers_and_wait(to_lsn).await
    }
    /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
@@ -1159,7 +1192,39 @@ impl Timeline {
        };
        let Some(open_layer) = &layers_guard.layer_map().open_layer else {
-            // No open layer, no work to do.
+            // If there is no open layer, we have no layer freezing to do.  However, we might need to generate
            // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
            // that didn't result in writes to this shard.
            // Must not hold the layers lock while waiting for a flush.
            drop(layers_guard);
            let last_record_lsn = self.get_last_record_lsn();
            let disk_consistent_lsn = self.get_disk_consistent_lsn();
            if last_record_lsn > disk_consistent_lsn {
                // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
                // we are a sharded tenant and have skipped some WAL
                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
                if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
                    // This should be somewhat rare, so we log it at INFO level.
                    //
                    // We checked for checkpoint timeout so that a shard without any
                    // data ingested (yet) doesn't write a remote index as soon as it
                    // sees its LSN advance: we only do this if we've been layer-less
                    // for some time.
                    tracing::info!(
                        "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
                        disk_consistent_lsn,
                        last_record_lsn
                    );
                    // The flush loop will update remote consistent LSN as well as disk consistent LSN.
                    self.flush_frozen_layers_and_wait(last_record_lsn)
                        .await
                        .ok();
                }
            }
            return;
        };
@@ -1288,83 +1353,119 @@ impl Timeline {
        self.launch_eviction_task(parent, background_jobs_can_start);
    }
-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// After this function returns, there are no timeline-scoped tasks left running.
    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
    ///
-    /// While we are flushing, we continue to accept read I/O.
+    /// The preferred pattern is:
-    pub(crate) async fn flush_and_shutdown(&self) {
+    /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
    /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
    ///   go the extra mile and keep track of JoinHandles
    /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
    ///   instead of spawning directly on a runtime. It is a more composable / testable pattern.
    ///
    /// For legacy reasons, we still have multiple tasks spawned using
    /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
    /// We refer to these as "timeline-scoped task_mgr tasks".
    /// Some of these tasks are already sensitive to Timeline::cancel while others are
    /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
    /// or [`task_mgr::shutdown_watcher`].
    /// We want to gradually convert the code base away from these.
    ///
    /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
    /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
    /// ones that aren't mentioned here):
    /// - [`TaskKind::TimelineDeletionWorker`]
    ///    - NB: also used for tenant deletion
    /// - [`TaskKind::RemoteUploadTask`]
    /// - [`TaskKind::InitialLogicalSizeCalculation`]
    /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
    ///
    /// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
    /// - [`TaskKind::Eviction`]
    /// - [`TaskKind::LayerFlushTask`]
    /// - [`TaskKind::OndemandLogicalSizeCalculation`]
    /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
        debug_assert_current_span_has_tenant_and_timeline_id();
-        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
+        let try_freeze_and_flush = match mode {
-        // trying to flush
+            ShutdownMode::FreezeAndFlush => true,
-        tracing::debug!("Waiting for WalReceiverManager...");
+            ShutdownMode::Hard => false,
-        task_mgr::shutdown_tasks(
+        };
            Some(TaskKind::WalReceiverManager),
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
        )
        .await;
-        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
+        // Regardless of whether we're going to try_freeze_and_flush
        // or not, stop ingesting any more data. Walreceiver only provides
        // cancellation but no "wait until gone", because it uses the Timeline::gate.
        // So, only after the self.gate.close() below will we know for sure that
        // no walreceiver tasks are left.
        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
        // data during the call to `self.freeze_and_flush()` below.
        // That's not ideal, but, we don't have the concept of a ChildGuard,
        // which is what we'd need to properly model early shutdown of the walreceiver
        // task sub-tree before the other Timeline task sub-trees.
        let walreceiver = self.walreceiver.lock().unwrap().take();
        tracing::debug!(
            is_some = walreceiver.is_some(),
            "Waiting for WalReceiverManager..."
        );
        if let Some(walreceiver) = walreceiver {
            walreceiver.cancel();
        }
        // ... and inform any waiters for newer LSNs that there won't be any.
        self.last_record_lsn.shutdown();
-        // now all writers to InMemory layer are gone, do the final flush if requested
+        if try_freeze_and_flush {
-        match self.freeze_and_flush().await {
+            // we shut down walreceiver above, so, we won't add anything more
-            Ok(_) => {
+            // to the InMemoryLayer; freeze it and wait for all frozen layers
-                // drain the upload queue
+            // to reach the disk & upload queue, then shut the upload queue and
-                if let Some(client) = self.remote_client.as_ref() {
+            // wait for it to drain.
-                    // if we did not wait for completion here, it might be our shutdown process
+            match self.freeze_and_flush().await {
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                Ok(_) => {
-                    // be spawned.
+                    // drain the upload queue
-                    //
+                    if let Some(client) = self.remote_client.as_ref() {
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                        // if we did not wait for completion here, it might be our shutdown process
-                    // obviously it does not make sense to stop while we wait for it, but what
+                        // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // about corner cases like s3 suddenly hanging up?
+                        // be spawned.
-                    client.shutdown().await;
+                        //
                        // what is problematic is the shutting down of RemoteTimelineClient, because
                        // obviously it does not make sense to stop while we wait for it, but what
                        // about corner cases like s3 suddenly hanging up?
                        client.shutdown().await;
                    }
                }
                Err(e) => {
                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
                    // we have some extra WAL replay to do next time the timeline starts.
                    warn!("failed to freeze and flush: {e:#}");
                }
            }
            Err(e) => {
                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
                // we have some extra WAL replay to do next time the timeline starts.
                warn!("failed to freeze and flush: {e:#}");
            }
        }
        self.shutdown().await;
    }
    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
    /// the graceful [`Timeline::flush_and_shutdown`] function.
    pub(crate) async fn shutdown(&self) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();
-        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
+        // Transition the remote_client into a state where it's only useful for timeline deletion.
-        // while doing so.
+        // (The deletion use case is why we can't just hook up remote_client to Self::cancel.)
        self.last_record_lsn.shutdown();
        // Shut down the layer flush task before the remote client, as one depends on the other
        task_mgr::shutdown_tasks(
            Some(TaskKind::LayerFlushTask),
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
        )
        .await;
        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
        // case our caller wants to use that for a deletion
        if let Some(remote_client) = self.remote_client.as_ref() {
            remote_client.stop();
            // As documented in remote_client.stop()'s doc comment, it's our responsibility
            // to shut down the upload queue tasks.
            // TODO: fix that, task management should be encapsulated inside remote_client.
            task_mgr::shutdown_tasks(
                Some(TaskKind::RemoteUploadTask),
                Some(self.tenant_shard_id),
                Some(self.timeline_id),
            )
            .await;
        }
        // TODO: work toward making this a no-op. See this function's doc comment for more context.
        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
-        // Finally wait until any gate-holders are complete
+        // Finally wait until any gate-holders are complete.
        //
        // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
        // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
        self.gate.close().await;
        self.metrics.shutdown();
@@ -1568,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .lazy_slru_download
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }
    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }
    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }
    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }
    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }
    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }
    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_algorithm
            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
    }
    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }
@@ -1632,14 +1741,26 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }
-    pub(super) fn tenant_conf_updated(&self) {
+    fn get_image_layer_creation_check_threshold(&self) -> u8 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_layer_creation_check_threshold
            .unwrap_or(
                self.conf
                    .default_tenant_conf
                    .image_layer_creation_check_threshold,
            )
    }
    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                &self.tenant_conf.read().unwrap().tenant_conf,
+                new_conf,
                &self.conf.default_tenant_conf,
            );
@@ -1666,7 +1787,7 @@ impl Timeline {
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
-        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+        tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
@@ -1682,17 +1803,16 @@ impl Timeline {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(state);
-        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
+        let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
-        let tenant_conf_guard = tenant_conf.read().unwrap();
+        let evictions_low_residence_duration_metric_threshold = {
-
+            let loaded_tenant_conf = tenant_conf.load();
        let evictions_low_residence_duration_metric_threshold =
            Self::get_evictions_low_residence_duration_metric_threshold(
-                &tenant_conf_guard.tenant_conf,
+                &loaded_tenant_conf.tenant_conf,
                &conf.default_tenant_conf,
-            );
+            )
-        drop(tenant_conf_guard);
+        };
        Arc::new_cyclic(|myself| {
            let mut result = Timeline {
@@ -1769,6 +1889,7 @@ impl Timeline {
                },
                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),
                last_received_wal: Mutex::new(None),
                rel_size_cache: RwLock::new(HashMap::new()),
@@ -1797,6 +1918,7 @@ impl Timeline {
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
            result
                .metrics
                .last_record_gauge
@@ -1873,20 +1995,19 @@ impl Timeline {
            self.timeline_id, self.tenant_shard_id
        );
-        let tenant_conf_guard = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.load();
-        let wal_connect_timeout = tenant_conf_guard
+        let wal_connect_timeout = tenant_conf
            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf_guard
+        let lagging_wal_timeout = tenant_conf
            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
+        let max_lsn_wal_lag = tenant_conf
            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
        drop(tenant_conf_guard);
        let mut guard = self.walreceiver.lock().unwrap();
        assert!(
@@ -2434,10 +2555,6 @@ impl Timeline {
                debug!("cancelling logical size calculation for timeline shutdown");
                calculation.await
            }
            _ = task_mgr::shutdown_watcher() => {
                debug!("cancelling logical size calculation for task shutdown");
                calculation.await
            }
        }
    }
@@ -2892,16 +3009,6 @@ impl Timeline {
        let mut completed_keyspace = KeySpace::default();
        // Hold the layer map whilst visiting the timeline to prevent
        // compaction, eviction and flushes from rendering the layers unreadable.
        //
        // TODO: Do we actually need to do this? In theory holding on
        // to [`tenant::storage_layer::Layer`] should be enough. However,
        // [`Timeline::get`] also holds the lock during IO, so more investigation
        // is needed.
        let guard = timeline.layers.read().await;
        let layers = guard.layer_map();
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -2911,6 +3018,9 @@ impl Timeline {
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);
            let guard = timeline.layers.read().await;
            let layers = guard.layer_map();
            let in_memory_layer = layers.find_in_memory_layer(|l| {
                let start_lsn = l.get_lsn_range().start;
                cont_lsn > start_lsn
@@ -2918,12 +3028,11 @@ impl Timeline {
            match in_memory_layer {
                Some(l) => {
                    let lsn_range = l.get_lsn_range().start..cont_lsn;
                    fringe.update(
-                        ReadableLayerDesc::InMemory {
+                        ReadableLayer::InMemoryLayer(l),
                            handle: l,
                            lsn_ceil: cont_lsn,
                        },
                        unmapped_keyspace.clone(),
                        lsn_range,
                    );
                }
                None => {
@@ -2935,30 +3044,43 @@ impl Timeline {
                            .into_iter()
                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                                (
-                                    ReadableLayerDesc::Persistent {
+                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
                                        desc: (*layer).clone(),
                                        lsn_range: lsn_floor..cont_lsn,
                                    },
                                    keyspace_accum.to_keyspace(),
                                    lsn_floor..cont_lsn,
                                )
                            })
-                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
+                            .for_each(|(layer, keyspace, lsn_range)| {
                                fringe.update(layer, keyspace, lsn_range)
                            });
                    }
                }
            }
-            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
+            // It's safe to drop the layer map lock after planning the next round of reads.
            // The fringe keeps readable handles for the layers which are safe to read even
            // if layers were compacted or flushed.
            //
            // The more interesting consideration is: "Why is the read algorithm still correct
            // if the layer map changes while it is operating?". Doing a vectored read on a
            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
            // covered by the read. The layer map tells us how to move the lsn downwards for a
            // range at *a particular point in time*. It is fine for the answer to be different
            // at two different time points.
            drop(guard);
            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
                let next_cont_lsn = lsn_range.start;
                layer_to_read
                    .get_values_reconstruct_data(
                        &guard,
                        keyspace_to_read.clone(),
                        lsn_range,
                        reconstruct_state,
                        ctx,
                    )
                    .await?;
                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = layer_to_read.get_lsn_floor();
+                cont_lsn = next_cont_lsn;
            } else {
                break;
            }
@@ -3036,7 +3158,7 @@ impl Timeline {
            }
        }
        ancestor
-            .wait_lsn(self.ancestor_lsn, ctx)
+            .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
            .await
            .map_err(|e| match e {
                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -3086,7 +3208,9 @@ impl Timeline {
        self.last_record_lsn.advance(new_lsn);
    }
-    async fn freeze_inmem_layer(&self, write_lock_held: bool) {
+    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
        // Freeze the current open in-memory layer. It will be written to disk on next
        // iteration.
@@ -3096,7 +3220,9 @@ impl Timeline {
            Some(self.write_lock.lock().await)
        };
-        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
+        let to_lsn = self.get_last_record_lsn();
        self.freeze_inmem_layer_at(to_lsn).await;
        to_lsn
    }
    async fn freeze_inmem_layer_at(&self, at: Lsn) {
@@ -3109,25 +3235,24 @@ impl Timeline {
    /// Layer flusher task's main loop.
    async fn flush_loop(
        self: &Arc<Self>,
-        mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
+        mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
        ctx: &RequestContext,
    ) {
        info!("started flush loop");
        loop {
            tokio::select! {
                _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task");
+                    info!("shutting down layer flush task due to Timeline::cancel");
                    break;
                },
                _ = task_mgr::shutdown_watcher() => {
                    info!("shutting down layer flush task");
                    break;
                },
                _ = layer_flush_start_rx.changed() => {}
            }
            trace!("waking up");
-            let flush_counter = *layer_flush_start_rx.borrow();
+            let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
            // The highest LSN to which we flushed in the loop over frozen layers
            let mut flushed_to_lsn = Lsn(0);
            let result = loop {
                if self.cancel.is_cancelled() {
                    info!("dropping out of flush loop for timeline shutdown");
@@ -3148,7 +3273,9 @@ impl Timeline {
                    break Ok(());
                };
                match self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    Ok(()) => {}
+                    Ok(this_layer_to_lsn) => {
                        flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
                    }
                    Err(FlushLayerError::Cancelled) => {
                        info!("dropping out of flush loop for timeline shutdown");
                        return;
@@ -3157,11 +3284,36 @@ impl Timeline {
                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                    ) => {
                        error!("could not flush frozen layer: {err:?}");
-                        break err;
+                        break err.map(|_| ());
                    }
                }
                timer.stop_and_record();
            };
            // Unsharded tenants should never advance their LSN beyond the end of the
            // highest layer they write: such gaps between layer data and the frozen LSN
            // are only legal on sharded tenants.
            debug_assert!(
                self.shard_identity.count.count() > 1
                    || flushed_to_lsn >= frozen_to_lsn
                    || !flushed_to_lsn.is_valid()
            );
            if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
                // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
                // to us via layer_flush_start_rx, then advance it here.
                //
                // This path is only taken for tenants with multiple shards: single sharded tenants should
                // never encounter a gap in the wal.
                let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
                tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
                if self.set_disk_consistent_lsn(frozen_to_lsn) {
                    if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
                        tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
                    }
                }
            }
            // Notify any listeners that we're done
            let _ = self
                .layer_flush_done_tx
@@ -3169,7 +3321,13 @@ impl Timeline {
        }
    }
-    async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
+    /// Request the flush loop to write out all frozen layers up to `last_record_lsn` as Delta L0 files to disk.
    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
    ///
    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
    /// it means no data will be written between the top of the highest frozen layer and `last_record_lsn`,
    /// e.g. because this tenant shard has ingested up to `last_record_lsn` and not written any data locally for that part of the WAL.
    async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
        let mut rx = self.layer_flush_done_tx.subscribe();
        // Increment the flush cycle counter and wake up the flush task.
@@ -3183,9 +3341,10 @@ impl Timeline {
            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }
-        self.layer_flush_start_tx.send_modify(|counter| {
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
            my_flush_request = *counter + 1;
            *counter = my_flush_request;
            *lsn = std::cmp::max(last_record_lsn, *lsn);
        });
        loop {
@@ -3222,16 +3381,22 @@ impl Timeline {
    }
    fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|val| *val += 1);
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
            *counter += 1;
            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
        });
    }
    /// Flush one frozen in-memory layer to disk, as a new delta layer.
    ///
    /// Return value is the last lsn (inclusive) of the layer that was frozen.
    #[instrument(skip_all, fields(layer=%frozen_layer))]
    async fn flush_frozen_layer(
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> Result<(), FlushLayerError> {
+    ) -> Result<Lsn, FlushLayerError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
        // As a special case, when we have just imported an image into the repository,
@@ -3306,7 +3471,6 @@ impl Timeline {
        }
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now. The flushed layer is stored in
@@ -3320,10 +3484,7 @@ impl Timeline {
            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
-            if disk_consistent_lsn != old_disk_consistent_lsn {
+            if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
                self.disk_consistent_lsn.store(disk_consistent_lsn);
                // Schedule remote uploads that will reflect our new disk_consistent_lsn
                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
            }
@@ -3340,7 +3501,22 @@ impl Timeline {
        // This failpoint is used by another test case `test_pageserver_recovery`.
        fail_point!("flush-frozen-exit");
-        Ok(())
+        Ok(Lsn(lsn_range.end.0 - 1))
    }
    /// Store `new_value` as the disk-consistent LSN and report whether the stored value changed.
    ///
    /// Returns `false` (and stores nothing) when `new_value` equals the current value, `true`
    /// otherwise. Panics if `new_value` is lower than the current value.
    ///
    /// Must only be called from the layer flush task, and never concurrently.
    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
        // This is a plain load followed by a plain store rather than an atomic
        // read-modify-write, which is why concurrent callers are not supported.
        let old_value = self.disk_consistent_lsn.load();
        if new_value == old_value {
            return false;
        }

        assert!(new_value >= old_value);
        self.disk_consistent_lsn.store(new_value);
        true
    }
    /// Update metadata file
@@ -3501,6 +3677,24 @@ impl Timeline {
    // Is it time to create a new image layer for the given partition?
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
        let last = self.last_image_layer_creation_check_at.load();
        if lsn != Lsn(0) {
            let distance = lsn
                .checked_sub(last)
                .expect("Attempt to compact with LSN going backwards");
            let min_distance = self.get_image_layer_creation_check_threshold() as u64
                * self.get_checkpoint_distance();
            // Skip the expensive delta layer counting below if we've not ingested
            // sufficient WAL since the last check.
            if distance.0 < min_distance {
                return false;
            }
        }
        self.last_image_layer_creation_check_at.store(lsn);
        let threshold = self.get_image_creation_threshold();
        let guard = self.layers.read().await;
@@ -3842,6 +4036,24 @@ impl Timeline {
        Ok(())
    }
    /// Schedule an upload for each of the given image layers, followed by an index upload.
    ///
    /// Does nothing and returns `Ok(())` when this timeline has no remote client.
    /// Any scheduling error is returned to the caller as soon as it occurs.
    fn upload_new_image_layers(
        self: &Arc<Self>,
        new_images: impl IntoIterator<Item = ResidentLayer>,
    ) -> anyhow::Result<()> {
        if let Some(client) = &self.remote_client {
            for image in new_images {
                client.schedule_layer_file_upload(image)?;
            }

            // If any new image layer was created, skipping the index_part upload would
            // leave remote_physical_size out of sync with the size calculated from the
            // layer map. That mismatch fails some tests, but should not be an issue otherwise.
            client.schedule_index_upload_for_file_changes()?;
        }
        Ok(())
    }
    /// Update information about which layer files need to be retained on
    /// garbage collection. This is separate from actually performing the GC,
    /// and is updated more frequently, so that compaction can remove obsolete
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -125,18 +125,8 @@ impl Timeline {
                    )
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
                    for layer in layers {
                        remote_client.schedule_layer_file_upload(layer)?;
                    }
                }
-                if let Some(remote_client) = &self.remote_client {
+                self.upload_new_image_layers(layers)?;
                    // should any new image layer been created, not uploading index_part will
                    // result in a mismatch between remote_physical_size and layermap calculated
                    // size, which will fail some tests, but should not be an issue otherwise.
                    remote_client.schedule_index_upload_for_file_changes()?;
                }
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -818,7 +808,10 @@ impl TimelineAdaptor {
        self.timeline
            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
            .await?;
-        self.new_images.clear();
+
        self.timeline
            .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
        self.new_deltas.clear();
        self.layers_to_delete.clear();
        Ok(())
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, Instrument};
+use tracing::{error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};
 use crate::{
@@ -14,7 +14,6 @@ use crate::{
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, Tenant,
@@ -23,58 +22,6 @@ use crate::{
 use super::{Timeline, TimelineResources};
 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    // Notify any timeline work to drop out of loops/requests
    tracing::debug!("Cancelling CancellationToken");
    timeline.cancel.cancel();
    // Stop the walreceiver first.
    debug!("waiting for wal receiver to shutdown");
    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
    if let Some(walreceiver) = maybe_started_walreceiver {
        walreceiver.stop().await;
    }
    debug!("wal receiver shutdown confirmed");
    // Shut down the layer flush task before the remote client, as one depends on the other
    task_mgr::shutdown_tasks(
        Some(TaskKind::LayerFlushTask),
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;
    // Prevent new uploads from starting.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        remote_client.stop();
    }
    // Stop & wait for the remaining timeline tasks, including upload tasks.
    // NB: This and other delete_timeline calls do not run as a task_mgr task,
    //     so, they are not affected by this shutdown_tasks() call.
    info!("waiting for timeline tasks to shutdown");
    task_mgr::shutdown_tasks(
        None,
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;
    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-before-index-deleted-at"
        ))?
    });
    tracing::debug!("Waiting for gate...");
    timeline.gate.close().await;
    tracing::debug!("Shutdown complete");
    Ok(())
 }
 /// Mark timeline as deleted in S3 so we won't pick it up next time
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
@@ -268,7 +215,14 @@ impl DeleteTimelineFlow {
        guard.mark_in_progress()?;
-        stop_tasks(&timeline).await?;
+        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;
        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
            ))?
        });
        set_deleted_in_remote_index(&timeline).await?;
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -67,20 +67,19 @@ impl Timeline {
            ),
            false,
            async move {
                let cancel = task_mgr::shutdown_token();
                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                };
-                self_clone.eviction_task(parent, cancel).await;
+                self_clone.eviction_task(parent).await;
                Ok(())
            },
        );
    }
    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
        use crate::tenant::tasks::random_init_delay;
        // acquire the gate guard only once within a useful span
@@ -95,7 +94,7 @@ impl Timeline {
                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
-            if random_init_delay(period, &cancel).await.is_err() {
+            if random_init_delay(period, &self.cancel).await.is_err() {
                return;
            }
        }
@@ -104,13 +103,13 @@ impl Timeline {
        loop {
            let policy = self.get_eviction_policy();
            let cf = self
-                .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
+                .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
                .await;
            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+                    if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
                        .await
                        .is_ok()
                    {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -120,9 +120,10 @@ impl LayerManager {
    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
    pub(crate) async fn try_freeze_in_memory_layer(
        &mut self,
-        Lsn(last_record_lsn): Lsn,
+        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
    ) {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
        if let Some(open_layer) = &self.layer_map.open_layer {
@@ -135,8 +136,11 @@ impl LayerManager {
            self.layer_map.frozen_layers.push_back(open_layer_rc);
            self.layer_map.open_layer = None;
            self.layer_map.next_open_layer_at = Some(end_lsn);
            last_freeze_at.store(end_lsn);
        }
        // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
        // accounts for regions in the LSN range where we might have ingested no data due to sharding.
        last_freeze_at.store(end_lsn);
    }
    /// Add image layers to the layer map, called from `create_image_layers`.
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -24,26 +24,21 @@ mod connection_manager;
 mod walreceiver_connection;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };
 use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
 use tokio::select;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TimelineId;
 use self::connection_manager::ConnectionManagerStatus;
 use super::Timeline;
@@ -62,9 +57,10 @@ pub struct WalReceiverConf {
 }
 pub struct WalReceiver {
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
    /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
    /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
    cancel: CancellationToken,
 }
 impl WalReceiver {
@@ -78,65 +74,58 @@ impl WalReceiver {
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
        let loop_status = Arc::new(std::sync::RwLock::new(None));
        let manager_status = Arc::clone(&loop_status);
-        task_mgr::spawn(
+        let cancel = timeline.cancel.child_token();
-            WALRECEIVER_RUNTIME.handle(),
+        WALRECEIVER_RUNTIME.spawn({
-            TaskKind::WalReceiverManager,
+            let cancel = cancel.clone();
            Some(timeline.tenant_shard_id),
            Some(timeline_id),
            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
            false,
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
                // acquire timeline gate so we know the task doesn't outlive the Timeline
                let Ok(_guard) = timeline.gate.enter() else {
                    debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
                    return;
                };
                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
                    cancel.clone(),
                );
-                loop {
+                while !cancel.is_cancelled() {
-                    select! {
+                    let loop_step_result = connection_manager_loop_step(
-                        _ = task_mgr::shutdown_watcher() => {
+                        &mut broker_client,
-                            trace!("WAL receiver shutdown requested, shutting down");
+                        &mut connection_manager_state,
                        &walreceiver_ctx,
                        &cancel,
                        &loop_status,
                    ).await;
                    match loop_step_result {
                        Ok(()) => continue,
                        Err(_cancelled) => {
                            trace!("Connection manager loop ended, shutting down");
                            break;
-                        },
+                        }
                        loop_step_result = connection_manager_loop_step(
                            &mut broker_client,
                            &mut connection_manager_state,
                            &walreceiver_ctx,
                            &loop_status,
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
                                trace!("Connection manager loop ended, shutting down");
                                break;
                            }
                        },
                    }
                }
                connection_manager_state.shutdown().await;
                *loop_status.write().unwrap() = None;
-                Ok(())
+                debug!("task exits");
            }
            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
-        );
+        });
        Self {
            tenant_shard_id,
            timeline_id,
            manager_status,
            cancel,
        }
    }
-    pub async fn stop(self) {
+    #[instrument(skip_all, level = tracing::Level::DEBUG)]
-        task_mgr::shutdown_tasks(
+    pub fn cancel(&self) {
-            Some(TaskKind::WalReceiverManager),
+        debug_assert_current_span_has_tenant_and_timeline_id();
-            Some(self.tenant_shard_id),
+        debug!("cancelling walreceiver tasks");
-            Some(self.timeline_id),
+        self.cancel.cancel();
        )
        .await;
    }
    pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -170,14 +159,18 @@ enum TaskStateUpdate<E> {
 impl<E: Clone> TaskHandle<E> {
    /// Initializes the task, starting it immediately after the creation.
    ///
    /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
    /// It being a child token enables us to provide a [`Self::shutdown`] method.
    fn spawn<Fut>(
        cancel_parent: &CancellationToken,
        task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
    ) -> Self
    where
        Fut: Future<Output = anyhow::Result<()>> + Send,
        E: Send + Sync + 'static,
    {
-        let cancellation = CancellationToken::new();
+        let cancellation = cancel_parent.child_token();
        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
        let cancellation_clone = cancellation.clone();
@@ -197,6 +190,9 @@ impl<E: Clone> TaskHandle<E> {
        }
    }
    /// # Cancel-Safety
    ///
    /// Cancellation-safe.
    async fn next_task_event(&mut self) -> TaskEvent<E> {
        match self.events_receiver.changed().await {
            Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::{shutdown_token, TaskKind};
+use crate::task_mgr::TaskKind;
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::{BrokerClientChannel, Code, Streaming};
-use tokio::select;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use postgres_connection::PgConnectionConfig;
@@ -45,27 +45,33 @@ use super::{
    TaskEvent, TaskHandle,
 };
 pub(crate) struct Cancelled;
 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
 /// If storage broker subscription is cancelled, exits.
 ///
 /// # Cancel-Safety
 ///
 /// Not cancellation-safe. Use `cancel` token to request cancellation.
 pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
    cancel: &CancellationToken,
    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
-) -> ControlFlow<(), ()> {
+) -> Result<(), Cancelled> {
-    match connection_manager_state
+    match tokio::select! {
-        .timeline
+        _ = cancel.cancelled() => { return Err(Cancelled); },
-        .wait_to_become_active(ctx)
+        st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
-        .await
+    } {
    {
        Ok(()) => {}
        Err(new_state) => {
            debug!(
                ?new_state,
                "state changed, stopping wal connection manager loop"
            );
-            return ControlFlow::Break(());
+            return Err(Cancelled);
        }
    }
@@ -86,7 +92,7 @@ pub(super) async fn connection_manager_loop_step(
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
-    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
+    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
    debug!("Subscribed for broker timeline updates");
    loop {
@@ -94,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
        // These things are happening concurrently:
        //
        // - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -101,7 +108,11 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
-        select! {
+
        // NB: make sure each of the select expressions are cancellation-safe
        // (no need for arms to be cancellation-safe).
        tokio::select! {
            _ = cancel.cancelled() => { return Err(Cancelled); }
            Some(wal_connection_update) = async {
                match connection_manager_state.wal_connection.as_mut() {
                    Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -133,7 +144,7 @@ pub(super) async fn connection_manager_loop_step(
            },
            // Got a new update from the broker
-            broker_update = broker_subscription.message() => {
+            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                    Err(status) => {
@@ -147,16 +158,17 @@ pub(super) async fn connection_manager_loop_step(
                                warn!("broker subscription failed: {status}");
                            }
                        }
-                        return ControlFlow::Continue(());
+                        return Ok(());
                    }
                    Ok(None) => {
                        error!("broker subscription stream ended"); // can't happen
-                        return ControlFlow::Continue(());
+                        return Ok(());
                    }
                }
            },
            new_event = async {
                // Reminder: this match arm needs to be cancellation-safe.
                loop {
                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                        warn!("wal connection manager should only be launched after timeline has become active");
@@ -182,11 +194,11 @@ pub(super) async fn connection_manager_loop_step(
                }
            } => match new_event {
                ControlFlow::Continue(()) => {
-                    return ControlFlow::Continue(());
+                    return Ok(());
                }
                ControlFlow::Break(()) => {
                    debug!("Timeline is no longer active, stopping wal connection manager loop");
-                    return ControlFlow::Break(());
+                    return Err(Cancelled);
                }
            },
@@ -218,16 +230,15 @@ pub(super) async fn connection_manager_loop_step(
 async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
-) -> Streaming<SafekeeperTimelineInfo> {
+    cancel: &CancellationToken,
 ) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
    let mut attempt = 0;
    let cancel = shutdown_token();
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel,
+            cancel,
        )
        .await;
        attempt += 1;
@@ -241,9 +252,14 @@ async fn subscribe_for_timeline_updates(
            subscription_key: Some(key),
        };
-        match broker_client.subscribe_safekeeper_info(request).await {
+        match {
            tokio::select! {
                r = broker_client.subscribe_safekeeper_info(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
            Ok(resp) => {
-                return resp.into_inner();
+                return Ok(resp.into_inner());
            }
            Err(e) => {
                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
@@ -264,6 +280,8 @@ pub(super) struct ConnectionManagerState {
    id: TenantTimelineId,
    /// Use pageserver data about the timeline to filter out some of the safekeepers.
    timeline: Arc<Timeline>,
    /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
    cancel: CancellationToken,
    conf: WalReceiverConf,
    /// Current connection to safekeeper for WAL streaming.
    wal_connection: Option<WalConnection>,
@@ -386,7 +404,11 @@ struct BrokerSkTimeline {
 }
 impl ConnectionManagerState {
-    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
+    pub(super) fn new(
        timeline: Arc<Timeline>,
        conf: WalReceiverConf,
        cancel: CancellationToken,
    ) -> Self {
        let id = TenantTimelineId {
            tenant_id: timeline.tenant_shard_id.tenant_id,
            timeline_id: timeline.timeline_id,
@@ -394,6 +416,7 @@ impl ConnectionManagerState {
        Self {
            id,
            timeline,
            cancel,
            conf,
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
@@ -401,6 +424,22 @@ impl ConnectionManagerState {
        }
    }
    /// Spawns `task` through [`super::TaskHandle::spawn`], passing it `self.cancel`,
    /// so that every connection task started by this manager is tied to the
    /// manager's own cancellation token.
    ///
    /// `task` is called with the watch sender it should use to publish
    /// [`TaskStateUpdate`]s and with a [`CancellationToken`].
    /// NOTE(review): presumably that token is derived from `self.cancel` inside
    /// `TaskHandle::spawn` — confirm there.
    fn spawn<Fut>(
        &self,
        task: impl FnOnce(
                tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
                CancellationToken,
            ) -> Fut
            + Send
            + 'static,
    ) -> TaskHandle<WalConnectionStatus>
    where
        Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
    {
        // TODO: get rid of TaskHandle
        super::TaskHandle::spawn(&self.cancel, task)
    }
    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
        WALRECEIVER_SWITCHES
@@ -419,7 +458,7 @@ impl ConnectionManagerState {
        );
        let span = info_span!("connection", %node_id);
-        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
+        let connection_handle = self.spawn(move |events_sender, cancellation| {
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
@@ -447,6 +486,12 @@ impl ConnectionManagerState {
                                info!("walreceiver connection handling ended: {e}");
                                Ok(())
                            }
                            WalReceiverError::ClosedGate => {
                                info!(
                                    "walreceiver connection handling ended because of closed gate"
                                );
                                Ok(())
                            }
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
@@ -486,6 +531,10 @@ impl ConnectionManagerState {
    /// Drops the current connection (if any) and updates retry timeout for the next
    /// connection attempt to the same safekeeper.
    ///
    /// # Cancel-Safety
    ///
    /// Not cancellation-safe.
    async fn drop_old_connection(&mut self, needs_shutdown: bool) {
        let wal_connection = match self.wal_connection.take() {
            Some(wal_connection) => wal_connection,
@@ -493,7 +542,14 @@ impl ConnectionManagerState {
        };
        if needs_shutdown {
-            wal_connection.connection_task.shutdown().await;
+            wal_connection
                .connection_task
                .shutdown()
                // This here is why this function isn't cancellation-safe.
                // If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
                // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
                // and thus be ineffective.
                .await;
        }
        let retry = self
@@ -838,6 +894,9 @@ impl ConnectionManagerState {
        }
    }
    /// # Cancel-Safety
    ///
    /// Not cancellation-safe.
    pub(super) async fn shutdown(mut self) {
        if let Some(wal_connection) = self.wal_connection.take() {
            wal_connection.connection_task.shutdown().await;
@@ -986,7 +1045,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1154,7 +1213,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1221,7 +1280,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1285,7 +1344,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
+            connection_task: state.spawn(move |_, _| async move { Ok(()) }),
            discovered_new_wal: Some(NewCommittedWAL {
                discovered_at: time_over_threshold,
                lsn: new_lsn,
@@ -1341,6 +1400,7 @@ mod tests {
                timeline_id: TIMELINE_ID,
            },
            timeline,
            cancel: CancellationToken::new(),
            conf: WalReceiverConf {
                wal_connect_timeout: Duration::from_secs(1),
                lagging_wal_timeout: Duration::from_secs(1),
@@ -1384,7 +1444,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -37,8 +36,8 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use utils::pageserver_feedback::PageserverFeedback;
 use utils::{id::NodeId, lsn::Lsn};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
    SuccessfulCompletion(String),
    /// Generic error
    Other(anyhow::Error),
    ClosedGate,
 }
 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    // prevent timeline shutdown from finishing until we have exited
    let _guard = timeline.gate.enter().map_err(|e| match e {
        GateError::GateClosed => WalReceiverError::ClosedGate,
    })?;
    // This function spawns a side-car task (WalReceiverConnectionPoller).
    // Get its gate guard now as well.
    let poller_guard = timeline.gate.enter().map_err(|e| match e {
        GateError::GateClosed => WalReceiverError::ClosedGate,
    })?;
    WALRECEIVER_STARTED_CONNECTIONS.inc();
    // Connect to the database in replication mode.
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
    }
    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
+    // so spawn it off to run on its own. It shouldn't outlive this function, but,
    // due to lack of async drop, we can't enforce that. However, we ensure that
    // 1. it is sensitive to `cancellation` and
    // 2. holds the Timeline gate open so that after timeline shutdown,
    //    we know this task is gone.
    let _connection_ctx = ctx.detached_child(
        TaskKind::WalReceiverConnectionPoller,
        ctx.download_behavior(),
    );
    let connection_cancellation = cancellation.clone();
-    task_mgr::spawn(
+    WALRECEIVER_RUNTIME.spawn(
        WALRECEIVER_RUNTIME.handle(),
        TaskKind::WalReceiverConnectionPoller,
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
        "walreceiver connection",
        false,
        async move {
            debug_assert_current_span_has_tenant_and_timeline_id();
            select! {
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
                            WalReceiverError::ClosedGate => {
                                // doesn't happen at runtime
                            }
                            WalReceiverError::Other(err) => {
                                warn!("Connection aborted: {err:#}")
                            }
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
                },
                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
            }
-            Ok(())
+            drop(poller_guard);
        }
        // Enrich the log lines emitted by this closure with meaningful context.
        // TODO: technically, this task outlives the surrounding function, so, the
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(
                trace!("received XLogData between {startlsn} and {endlsn}");
                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                waldecoder.feed_bytes(data);
                {
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -61,7 +61,7 @@ pub struct VectoredRead {
 }
 impl VectoredRead {
-    fn size(&self) -> usize {
+    pub fn size(&self) -> usize {
        (self.end - self.start) as usize
    }
 }
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
        .map_err(std::io::Error::from)
        .context("statvfs tenants directory")?;
-    let blocksz = statvfs.block_size();
+    // https://unix.stackexchange.com/a/703650
    let blocksz = if statvfs.fragment_size() > 0 {
        statvfs.fragment_size()
    } else {
        statvfs.block_size()
    };
    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let free = statvfs.blocks_available() as u64 * blocksz;
-    let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
+
    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let used = statvfs
        .blocks()
        // use blocks_free instead of available here to match df in case someone compares
        .saturating_sub(statvfs.blocks_free()) as u64
        * blocksz;
    let captured_at = std::time::SystemTime::now();
    let doc = PageserverUtilization {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut};
 use pageserver_api::key::key_to_rel_block;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::TenantShardId;
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
 use utils::lsn::Lsn;
 use utils::sync::heavier_once_cell;
 ///
 /// This is the real implementation that uses a Postgres process to
@@ -53,7 +54,19 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
+    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
    /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of
    /// their process object; we use [`Arc::clone`] for that.
    /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
    /// had that behavior; it's probably unnecessary.
    /// The only merit of it is that if one walredo process encounters an error,
    /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`])
    /// and retry redo, thereby starting the new process, while other redo tasks might
    /// still be using the old redo process. But, those other tasks will most likely
    /// encounter an error as well, and errors are an unexpected condition anyway.
    /// So, probably we could get rid of the `Arc` in the future.
    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
 }
 ///
@@ -101,6 +114,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
                    .await
                };
                img = Some(result?);
@@ -121,6 +135,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
            .await
        }
    }
@@ -134,7 +149,7 @@ impl PostgresRedoManager {
                    chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
                })
            },
-            pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
+            pid: self.redo_process.get().map(|p| p.id()),
        })
    }
 }
@@ -152,7 +167,7 @@ impl PostgresRedoManager {
            tenant_shard_id,
            conf,
            last_redo_at: std::sync::Mutex::default(),
-            redo_process: RwLock::new(None),
+            redo_process: heavier_once_cell::OnceCell::default(),
        }
    }
@@ -164,8 +179,7 @@ impl PostgresRedoManager {
            if let Some(last_redo_at) = *g {
                if last_redo_at.elapsed() >= idle_timeout {
                    drop(g);
-                    let mut guard = self.redo_process.write().unwrap();
+                    drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
                    *guard = None;
                }
            }
        }
@@ -174,8 +188,11 @@ impl PostgresRedoManager {
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
    /// # Cancel-Safety
    ///
    /// Cancellation safe.
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -191,42 +208,31 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            // launch the WAL redo process on first use
+            let proc: Arc<process::WalRedoProcess> =
-            let proc: Arc<process::WalRedoProcess> = {
+                match self.redo_process.get_or_init_detached().await {
-                let proc_guard = self.redo_process.read().unwrap();
+                    Ok(guard) => Arc::clone(&guard),
-                match &*proc_guard {
+                    Err(permit) => {
-                    None => {
+                        // don't hold poison_guard, the launch code can bail
-                        // "upgrade" to write lock to launch the process
+                        let start = Instant::now();
-                        drop(proc_guard);
+                        let proc = Arc::new(
-                        let mut proc_guard = self.redo_process.write().unwrap();
+                            process::WalRedoProcess::launch(
-                        match &*proc_guard {
+                                self.conf,
-                            None => {
+                                self.tenant_shard_id,
-                                let start = Instant::now();
+                                pg_version,
-                                let proc = Arc::new(
+                            )
-                                    process::WalRedoProcess::launch(
+                            .context("launch walredo process")?,
-                                        self.conf,
+                        );
-                                        self.tenant_shard_id,
+                        let duration = start.elapsed();
-                                        pg_version,
+                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                                    )
+                        info!(
-                                    .context("launch walredo process")?,
+                            duration_ms = duration.as_millis(),
-                                );
+                            pid = proc.id(),
-                                let duration = start.elapsed();
+                            "launched walredo process"
-                                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
+                        );
-                                    .observe(duration.as_secs_f64());
+                        self.redo_process.set(Arc::clone(&proc), permit);
-                                info!(
+                        proc
                                    duration_ms = duration.as_millis(),
                                    pid = proc.id(),
                                    "launched walredo process"
                                );
                                *proc_guard = Some(Arc::clone(&proc));
                                proc
                            }
                            Some(proc) => Arc::clone(proc),
                        }
                    }
-                    Some(proc) => Arc::clone(proc),
+                };
                }
            };
            let started_at = std::time::Instant::now();
@@ -272,34 +278,34 @@ impl PostgresRedoManager {
                    n_attempts,
                    e,
                );
-                // Avoid concurrent callers hitting the same issue.
+                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-                // We can't prevent it from happening because we want to enable parallelism.
+                // Note that there may be other tasks concurrent with us that also hold `proc`.
-                {
+                // We have to deal with that here.
-                    let mut guard = self.redo_process.write().unwrap();
+                // Also read the doc comment on field `self.redo_process`.
-                    match &*guard {
+                //
                        Some(current_field_value) => {
                            if Arc::ptr_eq(current_field_value, &proc) {
                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
                                *guard = None;
                            }
                        }
                        None => {
                            // Another thread was faster to observe the error, and already took the process out of rotation.
                        }
                    }
                }
                // NB: there may still be other concurrent threads using `proc`.
                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
+                //
-                // holding the lock while waiting for the process to exit.
+                // NB: the drop impl blocks the dropping thread with a wait() system call for
-                // NB: the drop impl blocks the current threads with a wait() system call for
+                // the child process. In some ways the blocking is actually good: if we
-                // the child process. We dropped the `guard` above so that other threads aren't
+                // deferred the waiting into the background / to tokio if we used `tokio::process`,
-                // affected. But, it's good that the current thread _does_ block to wait.
+                // it could happen that if walredo always fails immediately, we spawn processes faster
                // If we instead deferred the waiting into the background / to tokio, it could
                // happen that if walredo always fails immediately, we spawn processes faster
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
                match self.redo_process.get() {
                    None => (),
                    Some(guard) => {
                        if Arc::ptr_eq(&proc, &*guard) {
                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
                            guard.take_and_deinit();
                        } else {
                            // Another task already spawned another redo process (further up in this method)
                            // and put it into `redo_process`. Do nothing, our view of the world is behind.
                        }
                    }
                }
                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
                drop(proc);
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS];
 static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
 static void pageserver_disconnect_shard(shardno_t shard_no);
 static bool
 PagestoreShmemIsValid(void)
@@ -487,9 +488,32 @@ retry:
 	return ret;
 }
-
+/*
 * Reset prefetch and drop connection to the shard.
 * It also drops connection to all other shards involved in prefetch.
 */
 static void
 pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If the connection to any pageserver is lost, we throw away the
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
 	 *
 	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
 	 * because prefetch request may be registered before connection is established.
 	 */
 	prefetch_on_ps_disconnect();
 	/* Then drop the connection to the requested shard itself. */
 	pageserver_disconnect_shard(shard_no);
 }
 /*
 * Disconnect from specified shard
 */
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -503,14 +527,6 @@ pageserver_disconnect(shardno_t shard_no)
 		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
 		PQfinish(page_servers[shard_no].conn);
 		page_servers[shard_no].conn = NULL;
 		/*
 		 * If the connection to any pageserver is lost, we throw away the
 		 * whole prefetch queue, even for other pageservers. It should not
 		 * cause big problems, because connection loss is supposed to be a
 		 * rare event.
 		 */
 		prefetch_on_ps_disconnect();
 	}
 	if (page_servers[shard_no].wes != NULL)
 	{
@@ -676,7 +692,8 @@ page_server_api api =
 {
 	.send = pageserver_send,
 	.flush = pageserver_flush,
-	.receive = pageserver_receive
+	.receive = pageserver_receive,
 	.disconnect = pageserver_disconnect_shard
 };
 static bool
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -180,6 +180,7 @@ typedef struct
 	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
 	NeonResponse *(*receive) (shardno_t shard_no);
 	bool		(*flush) (shardno_t shard_no);
 	void        (*disconnect) (shardno_t shard_no);
 } page_server_api;
 extern void prefetch_on_ps_disconnect(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void)
 		Assert(slot->status == PRFS_REQUESTED);
 		Assert(slot->my_ring_index == ring_index);
 		/*
 		 * Drop connection to all shards which have prefetch requests.
 		 * It is not a problem to call disconnect multiple times on the same connection
 		 * because disconnect implementation in libpagestore.c will check if connection
 		 * is alive and do nothing if the connection was already dropped.
 		 */
 		page_server->disconnect(slot->shard_no);
 		/* clean up the request */
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
@@ -633,13 +641,12 @@ prefetch_on_ps_disconnect(void)
 static inline void
 prefetch_set_unused(uint64 ring_index)
 {
-	PrefetchRequest *slot = GetPrfSlot(ring_index);
+	PrefetchRequest *slot;
 	if (ring_index < MyPState->ring_last)
 		return;					/* Should already be unused */
-	Assert(MyPState->ring_unused > ring_index);
+	slot = GetPrfSlot(ring_index);
 	if (slot->status == PRFS_UNUSED)
 		return;
@@ -798,7 +805,8 @@ Retry:
 			{
 				if (*force_lsn > slot->effective_request_lsn)
 				{
-					prefetch_wait_for(ring_index);
+					if (!prefetch_wait_for(ring_index))
 						goto Retry;
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -813,7 +821,8 @@ Retry:
 			{
 				if (*force_lsn != slot->effective_request_lsn)
 				{
-					prefetch_wait_for(ring_index);
+					if (!prefetch_wait_for(ring_index))
 						goto Retry;
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -879,7 +888,8 @@ Retry:
 			{
 				case PRFS_REQUESTED:
 					Assert(MyPState->ring_receive == cleanup_index);
-					prefetch_wait_for(cleanup_index);
+					if (!prefetch_wait_for(cleanup_index))
 						goto Retry;
 					prefetch_set_unused(cleanup_index);
 					break;
 				case PRFS_RECEIVED:
@@ -1680,7 +1690,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2132,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
 	if (entry != NULL)
@@ -2153,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
-				prefetch_wait_for(slot->my_ring_index);
+				if (!prefetch_wait_for(slot->my_ring_index))
 					goto Retry;
 			}
 			/* drop caches */
 			prefetch_set_unused(slot->my_ring_index);
@@ -2216,7 +2228,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
 	}
 	/* buffer was used, clean up for later reuse */
@@ -2489,7 +2501,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
@@ -2544,7 +2556,7 @@ neon_dbsize(Oid dbNode)
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
 	}
 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2849,7 +2861,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
 	}
 	pfree(resp);
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -10,6 +10,7 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,8 +102,7 @@ pub(super) async fn authenticate(
    ctx.set_user(db_info.user.into());
    ctx.set_project(db_info.aux.clone());
-    let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
+    info!("woken up a compute node");
    info!(?cold_start_info, "woken up a compute node");
    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
    // while direct connections do not. Once we migrate to pg_sni_proxy
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -10,6 +10,7 @@ use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
 use proxy::proxy::run_until_cancelled;
 use proxy::{BranchId, EndpointId, ProjectId};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
@@ -269,7 +270,12 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;
-    let metrics_aux: MetricsAuxInfo = Default::default();
+    let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
        endpoint_id: (&EndpointId::from("")).into(),
        project_id: (&ProjectId::from("")).into(),
        branch_id: (&BranchId::from("")).into(),
        cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
    };
    // doesn't yet matter as pg-sni-router doesn't report analytics logs
    ctx.set_success();
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -10,6 +10,7 @@ use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
 use proxy::config::remote_storage_from_toml;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -191,6 +192,19 @@ struct ProxyCliArgs {
    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,
    /// interval for backup metric collection
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    metric_backup_collection_interval: std::time::Duration,
    /// remote storage configuration for backup metric collection
    /// Encoded as toml (same format as pageservers), eg
    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
    #[clap(long, default_value = "{}")]
    metric_backup_collection_remote_storage: String,
    /// chunk size for backup metric collection
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
 }
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -372,12 +386,17 @@ async fn main() -> anyhow::Result<()> {
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
    if let Some(metrics_config) = &config.metric_collection {
        // TODO: Add gc regardless of the metric collection being enabled.
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
        client_tasks.spawn(usage_metrics::task_backup(
            &metrics_config.backup_metric_collection_config,
            cancellation_token,
        ));
    }
    if let auth::BackendType::Console(api, _) = &config.auth_backend {
@@ -434,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    if args.allow_self_signed_compute {
        warn!("allowing self-signed compute certificates");
    }
    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
        interval: args.metric_backup_collection_interval,
        remote_storage_config: remote_storage_from_toml(
            &args.metric_backup_collection_remote_storage,
        )?,
        chunk_size: args.metric_backup_collection_chunk_size,
    };
    let metric_collection = match (
        &args.metric_collection_endpoint,
@@ -442,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
            interval: humantime::parse_duration(interval)?,
            backup_metric_collection_config,
        }),
        (None, None) => None,
        _ => bail!(
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -16,7 +16,7 @@ use crate::{
    config::ProjectInfoCacheOptions,
    console::AuthSecret,
    intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
-    EndpointId, ProjectId, RoleName,
+    EndpointId, RoleName,
 };
 use super::{Cache, Cached};
@@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_role_secret(
        &self,
-        project_id: &ProjectId,
+        project_id: ProjectIdInt,
-        endpoint_id: &EndpointId,
+        endpoint_id: EndpointIdInt,
-        role_name: &RoleName,
+        role_name: RoleNameInt,
        secret: Option<AuthSecret>,
    ) {
        let project_id = ProjectIdInt::from(project_id);
        let endpoint_id = EndpointIdInt::from(endpoint_id);
        let role_name = RoleNameInt::from(role_name);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
@@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_allowed_ips(
        &self,
-        project_id: &ProjectId,
+        project_id: ProjectIdInt,
-        endpoint_id: &EndpointId,
+        endpoint_id: EndpointIdInt,
        allowed_ips: Arc<Vec<IpPattern>>,
    ) {
        let project_id = ProjectIdInt::from(project_id);
        let endpoint_id = EndpointIdInt::from(endpoint_id);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
@@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::scram::ServerSecret;
+    use crate::{scram::ServerSecret, ProjectId};
    #[tokio::test]
    async fn test_project_info_cache_settings() {
@@ -369,8 +364,8 @@ mod tests {
            ttl: Duration::from_secs(1),
            gc_interval: Duration::from_secs(600),
        });
-        let project_id = "project".into();
+        let project_id: ProjectId = "project".into();
-        let endpoint_id = "endpoint".into();
+        let endpoint_id: EndpointId = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -379,9 +374,23 @@ mod tests {
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
        ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
+        cache.insert_role_secret(
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
+            (&project_id).into(),
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+            (&endpoint_id).into(),
            (&user1).into(),
            secret1.clone(),
        );
        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user2).into(),
            secret2.clone(),
        );
        cache.insert_allowed_ips(
            (&project_id).into(),
            (&endpoint_id).into(),
            allowed_ips.clone(),
        );
        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
        assert!(cached.cached());
@@ -393,7 +402,12 @@ mod tests {
        // Shouldn't add more than 2 roles.
        let user3: RoleName = "user3".into();
        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
-        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user3).into(),
            secret3.clone(),
        );
        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
@@ -421,8 +435,8 @@ mod tests {
        cache.clone().disable_ttl();
        tokio::time::advance(Duration::from_secs(2)).await;
-        let project_id = "project".into();
+        let project_id: ProjectId = "project".into();
-        let endpoint_id = "endpoint".into();
+        let endpoint_id: EndpointId = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -431,9 +445,23 @@ mod tests {
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
        ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
+        cache.insert_role_secret(
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
+            (&project_id).into(),
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+            (&endpoint_id).into(),
            (&user1).into(),
            secret1.clone(),
        );
        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user2).into(),
            secret2.clone(),
        );
        cache.insert_allowed_ips(
            (&project_id).into(),
            (&endpoint_id).into(),
            allowed_ips.clone(),
        );
        tokio::time::advance(Duration::from_secs(2)).await;
        // Nothing should be invalidated.
@@ -470,8 +498,8 @@ mod tests {
            gc_interval: Duration::from_secs(600),
        }));
-        let project_id = "project".into();
+        let project_id: ProjectId = "project".into();
-        let endpoint_id = "endpoint".into();
+        let endpoint_id: EndpointId = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -480,10 +508,20 @@ mod tests {
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
        ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user1).into(),
            secret1.clone(),
        );
        cache.clone().disable_ttl();
        tokio::time::advance(Duration::from_millis(100)).await;
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
+        cache.insert_role_secret(
            (&project_id).into(),
            (&endpoint_id).into(),
            (&user2).into(),
            secret2.clone(),
        );
        // Added before ttl was disabled + ttl should be still cached.
        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
@@ -497,7 +535,11 @@ mod tests {
        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
        // Added after ttl was disabled + ttl should not be cached.
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+        cache.insert_allowed_ips(
            (&project_id).into(),
            (&endpoint_id).into(),
            allowed_ips.clone(),
        );
        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
        assert!(!cached.cached());
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -276,6 +276,7 @@ impl ConnCfg {
        let stream = connection.stream.into_inner();
        info!(
            cold_start_info = ctx.cold_start_info.as_str(),
            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
            self.0.get_ssl_mode()
        );
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -5,6 +5,7 @@ use crate::{
 };
 use anyhow::{bail, ensure, Context, Ok};
 use itertools::Itertools;
 use remote_storage::RemoteStorageConfig;
 use rustls::{
    crypto::ring::sign,
    pki_types::{CertificateDer, PrivateKeyDer},
@@ -39,6 +40,7 @@ pub struct ProxyConfig {
 pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
    pub interval: Duration,
    pub backup_metric_collection_config: MetricBackupCollectionConfig,
 }
 pub struct TlsConfig {
@@ -311,6 +313,21 @@ impl CertResolver {
    }
 }
 /// Settings for the backup metric collection task.
 ///
 /// Built in `build_config` from the `metric_backup_collection_*` command-line
 /// arguments and carried inside `MetricCollectionConfig`.
 #[derive(Debug)]
 pub struct MetricBackupCollectionConfig {
    /// Interval for backup metric collection.
    pub interval: Duration,
    /// Remote storage configuration for backup metric collection
    /// (produced by `remote_storage_from_toml`; optional).
    pub remote_storage_config: OptRemoteStorageConfig,
    /// Chunk size for backup metric collection.
    // NOTE(review): the unit (events vs. bytes) is not visible here; confirm against the consumer.
    pub chunk_size: usize,
 }
 /// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
 /// runtime type errors from the value parser we use.
 pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
 /// Builds an optional remote storage configuration from the string `s`.
 ///
 /// `s` is parsed first (a parse failure is propagated as an error through `?`),
 /// and the parsed value is handed to `RemoteStorageConfig::from_toml`, whose
 /// result is returned unchanged.
 // NOTE(review): the CLI flag feeding this defaults to "{}"; presumably that
 // yields `Ok(None)`. Confirm against `RemoteStorageConfig::from_toml`.
 pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
    RemoteStorageConfig::from_toml(&s.parse()?)
 }
 /// Helper for cmdline cache options parsing.
 #[derive(Debug)]
 pub struct CacheOptions {
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -3,7 +3,7 @@ use std::fmt;
 use crate::auth::IpPattern;
-use crate::{BranchId, EndpointId, ProjectId};
+use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -18,7 +18,7 @@ pub struct ConsoleError {
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
    pub allowed_ips: Option<Vec<IpPattern>>,
-    pub project_id: Option<ProjectId>,
+    pub project_id: Option<ProjectIdInt>,
 }
 // Manually implement debug to omit sensitive info.
@@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo {
 /// Various labels for prometheus metrics.
 /// Also known as `ProxyMetricsAuxInfo` in the console.
-#[derive(Debug, Deserialize, Clone, Default)]
+#[derive(Debug, Deserialize, Clone)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: EndpointId,
+    pub endpoint_id: EndpointIdInt,
-    pub project_id: ProjectId,
+    pub project_id: ProjectIdInt,
-    pub branch_id: BranchId,
+    pub branch_id: BranchIdInt,
-    pub cold_start_info: Option<ColdStartInfo>,
+    #[serde(default)]
    pub cold_start_info: ColdStartInfo,
 }
-#[derive(Debug, Default, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
 #[serde(rename_all = "snake_case")]
 pub enum ColdStartInfo {
    #[default]
-    Unknown = 0,
+    Unknown,
-    Warm = 1,
+    /// Compute was already running
-    PoolHit = 2,
+    Warm,
-    PoolMiss = 3,
+    #[serde(rename = "pool_hit")]
    /// Compute was not running but there was an available VM
    VmPoolHit,
    #[serde(rename = "pool_miss")]
    /// Compute was not running and there were no VMs available
    VmPoolMiss,
    // not provided by control plane
    /// Connection available from HTTP pool
    HttpPoolHit,
    /// Cached connection info
    WarmCached,
 }
 impl ColdStartInfo {
    /// Returns the static snake_case label for this variant.
    ///
    /// The returned string is used as the `cold_start_info` value in log
    /// fields, metric labels and parquet request records, so changing any
    /// of these literals changes what is emitted.
    pub fn as_str(&self) -> &'static str {
        match self {
            ColdStartInfo::Unknown => "unknown",
            ColdStartInfo::Warm => "warm",
            // The Vm* variants keep the shorter "pool_*" labels, matching
            // their serde renames on the enum.
            ColdStartInfo::VmPoolHit => "pool_hit",
            ColdStartInfo::VmPoolMiss => "pool_miss",
            ColdStartInfo::HttpPoolHit => "http_pool_hit",
            ColdStartInfo::WarmCached => "warm_cached",
        }
    }
 }
 #[cfg(test)]
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -12,7 +12,8 @@ use crate::{
    compute,
    config::{CacheOptions, ProjectInfoCacheOptions},
    context::RequestMonitoring,
-    scram, EndpointCacheKey, ProjectId,
+    intern::ProjectIdInt,
    scram, EndpointCacheKey,
 };
 use dashmap::DashMap;
 use std::{sync::Arc, time::Duration};
@@ -271,7 +272,7 @@ pub struct AuthInfo {
    /// List of IP addresses allowed for the authorization.
    pub allowed_ips: Vec<IpPattern>,
    /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<ProjectId>,
+    pub project_id: Option<ProjectIdInt>,
 }
 /// Info for establishing a connection to a compute node.
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,10 +4,16 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
 use crate::{
    console::{
        messages::MetricsAuxInfo,
        provider::{CachedAllowedIps, CachedRoleSecret},
    },
    BranchId, EndpointId, ProjectId,
 };
 use futures::TryFutureExt;
 use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
@@ -114,7 +120,12 @@ impl Api {
        let node = NodeInfo {
            config,
-            aux: Default::default(),
+            aux: MetricsAuxInfo {
                endpoint_id: (&EndpointId::from("endpoint")).into(),
                project_id: (&ProjectId::from("project")).into(),
                branch_id: (&BranchId::from("branch")).into(),
                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
            },
            allow_self_signed_compute: false,
        };
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -181,15 +181,16 @@ impl super::Api for Api {
        }
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
        if let Some(project_id) = auth_info.project_id {
            let ep_int = ep.into();
            self.caches.project_info.insert_role_secret(
-                &project_id,
+                project_id,
-                ep,
+                ep_int,
-                user,
+                user.into(),
                auth_info.secret.clone(),
            );
            self.caches.project_info.insert_allowed_ips(
-                &project_id,
+                project_id,
-                ep,
+                ep_int,
                Arc::new(auth_info.allowed_ips),
            );
            ctx.set_project_id(project_id);
@@ -217,15 +218,16 @@ impl super::Api for Api {
        let allowed_ips = Arc::new(auth_info.allowed_ips);
        let user = &user_info.user;
        if let Some(project_id) = auth_info.project_id {
            let ep_int = ep.into();
            self.caches.project_info.insert_role_secret(
-                &project_id,
+                project_id,
-                ep,
+                ep_int,
-                user,
+                user.into(),
                auth_info.secret.clone(),
            );
            self.caches
                .project_info
-                .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
+                .insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
            ctx.set_project_id(project_id);
        }
        Ok((
@@ -248,8 +250,7 @@ impl super::Api for Api {
        // which means that we might cache it to reduce the load and latency.
        if let Some(cached) = self.caches.node_info.get(&key) {
            info!(key = &*key, "found cached compute node info");
-            info!("cold_start_info=warm");
+            ctx.set_project(cached.aux.clone());
            ctx.set_cold_start_info(ColdStartInfo::Warm);
            return Ok(cached);
        }
@@ -260,17 +261,21 @@ impl super::Api for Api {
        if permit.should_check_cache() {
            if let Some(cached) = self.caches.node_info.get(&key) {
                info!(key = &*key, "found cached compute node info");
-                info!("cold_start_info=warm");
+                ctx.set_project(cached.aux.clone());
                ctx.set_cold_start_info(ColdStartInfo::Warm);
                return Ok(cached);
            }
        }
-        let node = self.do_wake_compute(ctx, user_info).await?;
+        let mut node = self.do_wake_compute(ctx, user_info).await?;
        ctx.set_project(node.aux.clone());
-        let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
+        let cold_start_info = node.aux.cold_start_info;
-        info!(?cold_start_info, "woken up a compute node");
+        info!("woken up a compute node");
-        let (_, cached) = self.caches.node_info.insert(key.clone(), node);
+
        // store the cached node as 'warm'
        node.aux.cold_start_info = ColdStartInfo::WarmCached;
        let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
        cached.aux.cold_start_info = cold_start_info;
        info!(key = &*key, "created a cache entry for compute node info");
        Ok(cached)
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -11,8 +11,9 @@ use uuid::Uuid;
 use crate::{
    console::messages::{ColdStartInfo, MetricsAuxInfo},
    error::ErrorKind,
    intern::{BranchIdInt, ProjectIdInt},
    metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
-    BranchId, DbName, EndpointId, ProjectId, RoleName,
+    DbName, EndpointId, RoleName,
 };
 use self::parquet::RequestData;
@@ -34,8 +35,8 @@ pub struct RequestMonitoring {
    pub span: Span,
    // filled in as they are discovered
-    project: Option<ProjectId>,
+    project: Option<ProjectIdInt>,
-    branch: Option<BranchId>,
+    branch: Option<BranchIdInt>,
    endpoint_id: Option<EndpointId>,
    dbname: Option<DbName>,
    user: Option<RoleName>,
@@ -43,7 +44,7 @@ pub struct RequestMonitoring {
    error_kind: Option<ErrorKind>,
    pub(crate) auth_method: Option<AuthMethod>,
    success: bool,
-    cold_start_info: Option<ColdStartInfo>,
+    pub(crate) cold_start_info: ColdStartInfo,
    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -92,7 +93,7 @@ impl RequestMonitoring {
            error_kind: None,
            auth_method: None,
            success: false,
-            cold_start_info: None,
+            cold_start_info: ColdStartInfo::Unknown,
            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
            latency_timer: LatencyTimer::new(protocol),
@@ -113,26 +114,31 @@ impl RequestMonitoring {
    }
    pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
-        self.cold_start_info = Some(info);
+        self.cold_start_info = info;
        self.latency_timer.cold_start_info(info);
    }
    pub fn set_project(&mut self, x: MetricsAuxInfo) {
-        self.set_endpoint_id(x.endpoint_id);
+        if self.endpoint_id.is_none() {
            self.set_endpoint_id(x.endpoint_id.as_str().into())
        }
        self.branch = Some(x.branch_id);
        self.project = Some(x.project_id);
-        self.cold_start_info = x.cold_start_info;
+        self.set_cold_start_info(x.cold_start_info);
    }
-    pub fn set_project_id(&mut self, project_id: ProjectId) {
+    pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
        self.project = Some(project_id);
    }
    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
-        self.span.record("ep", display(&endpoint_id));
+        if self.endpoint_id.is_none() {
-        crate::metrics::CONNECTING_ENDPOINTS
+            self.span.record("ep", display(&endpoint_id));
-            .with_label_values(&[self.protocol])
+            crate::metrics::CONNECTING_ENDPOINTS
-            .measure(&endpoint_id);
+                .with_label_values(&[self.protocol])
-        self.endpoint_id = Some(endpoint_id);
+                .measure(&endpoint_id);
            self.endpoint_id = Some(endpoint_id);
        }
    }
    pub fn set_application(&mut self, app: Option<SmolStr>) {
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,12 +13,14 @@ use parquet::{
    },
    record::RecordWriter,
 };
-use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;
 use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
 use super::{RequestMonitoring, LOG_CHAN};
 #[derive(clap::Args, Clone, Debug)]
@@ -50,21 +52,13 @@ pub struct ParquetUploadArgs {
    parquet_upload_compression: Compression,
 }
 /// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
 /// runtime type errors from the value parser we use.
 type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
 fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
    RemoteStorageConfig::from_toml(&s.parse()?)
 }
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If an upload fails, we log it at info-level, and retry.
 // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_UPLOAD_RETRIES times, we give up
-pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
-pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
+pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 // the parquet crate leaves a lot to be desired...
 // what follows is an attempt to write parquet files with minimal allocs.
@@ -93,7 +87,7 @@ pub struct RequestData {
    /// Or if we make it to proxy_pass
    success: bool,
    /// Indicates if the cplane started the new compute node for this request.
-    cold_start_info: Option<&'static str>,
+    cold_start_info: &'static str,
    /// Tracks time from session start (HTTP request/libpq TCP handshake)
    /// Through to success/failure
    duration_us: u64,
@@ -121,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData {
            region: value.region,
            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
            success: value.success,
-            cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
+            cold_start_info: value.cold_start_info.as_str(),
                crate::console::messages::ColdStartInfo::Unknown => "unknown",
                crate::console::messages::ColdStartInfo::Warm => "warm",
                crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
                crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
            }),
            duration_us: SystemTime::from(value.first_packet)
                .elapsed()
                .unwrap_or_default()
@@ -460,7 +449,7 @@ mod tests {
            region: "us-east-1",
            error: None,
            success: rng.gen(),
-            cold_start_info: Some("no"),
+            cold_start_info: "no",
            duration_us: rng.gen_range(0..30_000_000),
        }
    }
@@ -530,15 +519,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1314406, 3, 6000),
+                (1314385, 3, 6000),
-                (1314399, 3, 6000),
+                (1314378, 3, 6000),
-                (1314459, 3, 6000),
+                (1314438, 3, 6000),
-                (1314416, 3, 6000),
+                (1314395, 3, 6000),
-                (1314546, 3, 6000),
+                (1314525, 3, 6000),
-                (1314388, 3, 6000),
+                (1314367, 3, 6000),
-                (1314180, 3, 6000),
+                (1314159, 3, 6000),
-                (1314416, 3, 6000),
+                (1314395, 3, 6000),
-                (438359, 1, 2000)
+                (438352, 1, 2000)
            ]
        );
@@ -568,11 +557,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1220668, 5, 10000),
+                (1220633, 5, 10000),
-                (1226818, 5, 10000),
+                (1226783, 5, 10000),
-                (1228612, 5, 10000),
+                (1228577, 5, 10000),
-                (1227974, 5, 10000),
+                (1227939, 5, 10000),
-                (1219252, 5, 10000)
+                (1219217, 5, 10000)
            ]
        );
@@ -604,11 +593,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1206315, 5, 10000),
+                (1206280, 5, 10000),
-                (1206046, 5, 10000),
+                (1206011, 5, 10000),
-                (1206339, 5, 10000),
+                (1206304, 5, 10000),
-                (1206327, 5, 10000),
+                (1206292, 5, 10000),
-                (1206582, 5, 10000)
+                (1206547, 5, 10000)
            ]
        );
@@ -633,15 +622,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1314406, 3, 6000),
+                (1314385, 3, 6000),
-                (1314399, 3, 6000),
+                (1314378, 3, 6000),
-                (1314459, 3, 6000),
+                (1314438, 3, 6000),
-                (1314416, 3, 6000),
+                (1314395, 3, 6000),
-                (1314546, 3, 6000),
+                (1314525, 3, 6000),
-                (1314388, 3, 6000),
+                (1314367, 3, 6000),
-                (1314180, 3, 6000),
+                (1314159, 3, 6000),
-                (1314416, 3, 6000),
+                (1314395, 3, 6000),
-                (438359, 1, 2000)
+                (438352, 1, 2000)
            ]
        );
@@ -678,7 +667,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
+            [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
        );
        tmpdir.close().unwrap();
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -12,6 +12,8 @@ use metrics::{
 use once_cell::sync::Lazy;
 use tokio::time::{self, Instant};
 use crate::console::messages::ColdStartInfo;
 pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
    register_int_counter_pair_vec!(
        "proxy_opened_db_connections_total",
@@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
        "proxy_compute_connection_latency_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
        // http/ws/tcp, cold start info (6 variants), success/failed, client/client_and_cplane
-        // 3 * 2 * 2 * 2 * 2 = 48 counters
+        // 3 * 6 * 2 * 2 = 72 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
+        &["protocol", "cold_start_info", "outcome", "excluded"],
        // largest bucket = 2^15 * 0.5ms = ~16.4s (16 buckets, the first one at 0.5ms)
        exponential_buckets(0.0005, 2.0, 16).unwrap(),
    )
@@ -117,12 +119,15 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
    .unwrap()
 });
-pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
+pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram!(
+    register_histogram_vec!(
        "proxy_http_conn_content_length_bytes",
-        "Time it took for proxy to establish a connection to the compute endpoint",
+        "Number of bytes the HTTP response content consumes",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
+        // request/response
-        exponential_buckets(8.0, 2.0, 20).unwrap()
+        &["direction"],
        // smallest bucket = 16 bytes
        // largest bucket = 4^11 * 16 bytes = 64MB (12 buckets, the first one at 16 bytes)
        exponential_buckets(16.0, 4.0, 12).unwrap()
    )
    .unwrap()
 });
@@ -180,6 +185,20 @@ struct Accumulated {
    compute: time::Duration,
 }
 /// Result of a connection attempt as tracked by `LatencyTimer`.
 ///
 /// Used for the `outcome` label of the compute connection latency histogram.
 enum Outcome {
    /// Set by `LatencyTimer::success`.
    Success,
    /// Initial state of a new `LatencyTimer` (failure is assumed unless
    /// success is reported).
    Failed,
 }
 impl Outcome {
    /// Returns the static metric label value for this outcome.
    fn as_str(&self) -> &'static str {
        match self {
            Outcome::Success => "success",
            Outcome::Failed => "failed",
        }
    }
 }
 pub struct LatencyTimer {
    // time since the stopwatch was started
    start: time::Instant,
@@ -189,9 +208,8 @@ pub struct LatencyTimer {
    accumulated: Accumulated,
    // label data
    protocol: &'static str,
-    cache_miss: bool,
+    cold_start_info: ColdStartInfo,
-    pool_miss: bool,
+    outcome: Outcome,
    outcome: &'static str,
 }
 pub struct LatencyTimerPause<'a> {
@@ -207,11 +225,9 @@ impl LatencyTimer {
            stop: None,
            accumulated: Accumulated::default(),
            protocol,
-            cache_miss: false,
+            cold_start_info: ColdStartInfo::Unknown,
            // by default we don't do pooling
            pool_miss: true,
            // assume failed unless otherwise specified
-            outcome: "failed",
+            outcome: Outcome::Failed,
        }
    }
@@ -223,12 +239,8 @@ impl LatencyTimer {
        }
    }
-    pub fn cache_miss(&mut self) {
+    pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) {
-        self.cache_miss = true;
+        self.cold_start_info = cold_start_info;
    }
    pub fn pool_hit(&mut self) {
        self.pool_miss = false;
    }
    pub fn success(&mut self) {
@@ -236,7 +248,7 @@ impl LatencyTimer {
        self.stop = Some(time::Instant::now());
        // success
-        self.outcome = "success";
+        self.outcome = Outcome::Success;
    }
 }
@@ -261,9 +273,8 @@ impl Drop for LatencyTimer {
        COMPUTE_CONNECTION_LATENCY
            .with_label_values(&[
                self.protocol,
-                bool_to_str(self.cache_miss),
+                self.cold_start_info.as_str(),
-                bool_to_str(self.pool_miss),
+                self.outcome.as_str(),
                self.outcome,
                "client",
            ])
            .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
@@ -272,9 +283,8 @@ impl Drop for LatencyTimer {
        COMPUTE_CONNECTION_LATENCY
            .with_label_values(&[
                self.protocol,
-                bool_to_str(self.cache_miss),
+                self.cold_start_info.as_str(),
-                bool_to_str(self.pool_miss),
+                self.outcome.as_str(),
                self.outcome,
                "client_and_cplane",
            ])
            .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
 }
 /// Try to connect to the compute node, retrying if necessary.
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
 pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
    ctx: &mut RequestMonitoring,
@@ -132,7 +131,6 @@ where
    } else {
        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
        info!("compute node's state has likely changed; requesting a wake-up");
        ctx.latency_timer.cache_miss();
        let old_node_info = invalidate_cache(node_info);
        let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
        node_info.reuse_settings(old_node_info);
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -4,7 +4,7 @@ use crate::{
    console::messages::MetricsAuxInfo,
    metrics::NUM_BYTES_PROXIED_COUNTER,
    stream::Stream,
-    usage_metrics::{Ids, USAGE_METRICS},
+    usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
 };
 use metrics::IntCounterPairGuard;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -19,8 +19,8 @@ pub async fn proxy_pass(
    aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
+        endpoint_id: aux.endpoint_id,
-        branch_id: aux.branch_id.clone(),
+        branch_id: aux.branch_id,
    });
    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -12,11 +12,12 @@ use crate::auth::backend::{
 };
 use crate::config::CertResolver;
 use crate::console::caches::NodeInfoCache;
 use crate::console::messages::MetricsAuxInfo;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
-use crate::{http, sasl, scram};
+use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism {
 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
    let node = NodeInfo {
        config: compute::ConnCfg::new(),
-        aux: Default::default(),
+        aux: MetricsAuxInfo {
            endpoint_id: (&EndpointId::from("endpoint")).into(),
            project_id: (&ProjectId::from("project")).into(),
            branch_id: (&BranchId::from("branch")).into(),
            cold_start_info: crate::console::messages::ColdStartInfo::Warm,
        },
        allow_self_signed_compute: false,
    };
    let (_, node) = cache.insert("key".into(), node);
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -9,7 +9,6 @@ use crate::{
    config::ProxyConfig,
    console::{
        errors::{GetAuthInfoError, WakeComputeError},
        messages::ColdStartInfo,
        CachedNodeInfo,
    },
    context::RequestMonitoring,
@@ -57,7 +56,10 @@ impl PoolingBackend {
        let auth_outcome =
            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
        let res = match auth_outcome {
-            crate::sasl::Outcome::Success(key) => Ok(key),
+            crate::sasl::Outcome::Success(key) => {
                info!("user successfully authenticated");
                Ok(key)
            }
            crate::sasl::Outcome::Failure(reason) => {
                info!("auth backend failed with an error: {reason}");
                Err(AuthError::auth_failed(&*conn_info.user_info.user))
@@ -89,8 +91,6 @@ impl PoolingBackend {
        };
        if let Some(client) = maybe_client {
            info!("cold_start_info=warm");
            ctx.set_cold_start_info(ColdStartInfo::Warm);
            return Ok(client);
        }
        let conn_id = uuid::Uuid::new_v4();
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -17,7 +17,7 @@ use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
-use crate::console::messages::MetricsAuxInfo;
+use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{
@@ -383,9 +383,12 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                    "pid",
                    &tracing::field::display(client.inner.get_process_id()),
                );
-                info!("pool: reusing connection '{conn_info}'");
+                info!(
                    cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                    "pool: reusing connection '{conn_info}'"
                );
                client.session.send(ctx.session_id)?;
-                ctx.latency_timer.pool_hit();
+                ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
                ctx.latency_timer.success();
                return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
            }
@@ -454,8 +457,9 @@ pub fn poll_client<C: ClientInnerExt>(
    let (tx, mut rx) = tokio::sync::watch::channel(session_id);
    let span = info_span!(parent: None, "connection", %conn_id);
    let cold_start_info = ctx.cold_start_info;
    span.in_scope(|| {
-        info!(%conn_info, %session_id, "new connection");
+        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
    });
    let pool = match conn_info.endpoint_cache_key() {
        Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
@@ -565,8 +569,8 @@ impl<C: ClientInnerExt> Client<C> {
    pub fn metrics(&self) -> Arc<MetricCounter> {
        let aux = &self.inner.as_ref().unwrap().aux;
        USAGE_METRICS.register(Ids {
-            endpoint_id: aux.endpoint_id.clone(),
+            endpoint_id: aux.endpoint_id,
-            branch_id: aux.branch_id.clone(),
+            branch_id: aux.branch_id,
        })
    }
 }
@@ -666,6 +670,8 @@ impl<C: ClientInnerExt> Drop for Client<C> {
 mod tests {
    use std::{mem, sync::atomic::AtomicBool};
    use crate::{BranchId, EndpointId, ProjectId};
    use super::*;
    struct MockClient(Arc<AtomicBool>);
@@ -691,7 +697,12 @@ mod tests {
        ClientInner {
            inner: client,
            session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
-            aux: Default::default(),
+            aux: MetricsAuxInfo {
                endpoint_id: (&EndpointId::from("endpoint")).into(),
                project_id: (&ProjectId::from("project")).into(),
                branch_id: (&BranchId::from("branch")).into(),
                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
            },
            conn_id: uuid::Uuid::new_v4(),
        }
    }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -42,12 +42,15 @@ use crate::error::ReportableError;
 use crate::error::UserFacingError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::run_until_cancelled;
 use crate::proxy::NeonOptions;
 use crate::serverless::backend::HttpConnError;
 use crate::usage_metrics::MetricCounterRecorder;
 use crate::DbName;
 use crate::RoleName;
 use super::backend::PoolingBackend;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -219,14 +222,7 @@ pub async fn handle(
    backend: Arc<PoolingBackend>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let cancel2 = cancel.clone();
    let handle = tokio::spawn(async move {
        time::sleep(config.http_config.request_timeout).await;
        cancel2.cancel();
    });
    let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
    handle.abort();
    let mut response = match result {
        Ok(r) => {
@@ -237,10 +233,7 @@ pub async fn handle(
            let error_kind = e.get_error_kind();
            ctx.set_error_kind(error_kind);
-            let message = format!(
+            let message = "Query cancelled, connection was terminated";
                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
                config.http_config.request_timeout.as_secs_f64()
            );
            tracing::info!(
                kind=error_kind.to_metric_label(),
@@ -434,6 +427,63 @@ impl ReportableError for SqlOverHttpCancel {
    }
 }
 #[derive(Clone, Copy, Debug)]
 struct HttpHeaders {
    raw_output: bool,
    default_array_mode: bool,
    txn_isolation_level: Option<IsolationLevel>,
    txn_read_only: bool,
    txn_deferrable: bool,
 }
 impl HttpHeaders {
    fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
        // Determine the output options. Default behaviour is 'false'. Anything that is not
        // strictly 'true' assumed to be false.
        let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
        let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
        // isolation level, read only and deferrable
        let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
            Some(x) => Some(
                map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
            ),
            None => None,
        };
        let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
        let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
        Ok(Self {
            raw_output,
            default_array_mode,
            txn_isolation_level,
            txn_read_only,
            txn_deferrable,
        })
    }
 }
 fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
    match level.as_bytes() {
        b"Serializable" => Some(IsolationLevel::Serializable),
        b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
        b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
        b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
        _ => None,
    }
 }
 fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
    match level {
        IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
        IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
        IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
        IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
        _ => None,
    }
 }
 async fn handle_inner(
    cancel: CancellationToken,
    config: &'static ProxyConfig,
@@ -450,43 +500,26 @@ async fn handle_inner(
    // Determine the destination and connection params
    //
    let headers = request.headers();
    // TLS config should be there.
    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
    info!(user = conn_info.user_info.user.as_str(), "credentials");
    // Determine the output options. Default behaviour is 'false'. Anything that is not
    // strictly 'true' assumed to be false.
    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
    let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
    let allow_pool = !config.http_config.pool_options.opt_in
        || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
-    // isolation level, read only and deferrable
+    let parsed_headers = HttpHeaders::try_parse(headers)?;
    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
    let txn_isolation_level = match txn_isolation_level_raw {
        Some(ref x) => Some(match x.as_bytes() {
            b"Serializable" => IsolationLevel::Serializable,
            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
            b"ReadCommitted" => IsolationLevel::ReadCommitted,
            b"RepeatableRead" => IsolationLevel::RepeatableRead,
            _ => return Err(SqlOverHttpError::InvalidIsolationLevel),
        }),
        None => None,
    };
    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
    };
    info!(request_content_length, "request size in bytes");
-    HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
+    HTTP_CONTENT_LENGTH
        .with_label_values(&["request"])
        .observe(request_content_length as f64);
    // we don't have a streaming request support yet so this is to prevent OOM
    // from a malicious user sending an extremely large request body
@@ -514,20 +547,18 @@ async fn handle_inner(
    }
    .map_err(SqlOverHttpError::from);
-    // Run both operations in parallel
+    let (payload, mut client) = match run_until_cancelled(
-    let (payload, mut client) = match select(
+        // Run both operations in parallel
        try_join(
            pin!(fetch_and_process_request),
            pin!(authenticate_and_connect),
        ),
-        pin!(cancel.cancelled()),
+        &cancel,
    )
    .await
    {
-        Either::Left((result, _cancelled)) => result?,
+        Some(result) => result?,
-        Either::Right((_cancelled, _)) => {
+        None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)),
            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
        }
    };
    let mut response = Response::builder()
@@ -537,95 +568,143 @@ async fn handle_inner(
    //
    // Now execute the query and return the result
    //
    let mut size = 0;
    let result = match payload {
-        Payload::Single(stmt) => {
+        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
            let mut size = 0;
            let (inner, mut discard) = client.inner();
            let cancel_token = inner.cancel_token();
            let query = pin!(query_to_json(
                &*inner,
                stmt,
                &mut size,
                raw_output,
                default_array_mode
            ));
            let cancelled = pin!(cancel.cancelled());
            let res = select(query, cancelled).await;
            match res {
                Either::Left((Ok((status, results)), _cancelled)) => {
                    discard.check_idle(status);
                    results
                }
                Either::Left((Err(e), _cancelled)) => {
                    discard.discard();
                    return Err(e);
                }
                Either::Right((_cancelled, query)) => {
                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
                        tracing::error!(?err, "could not cancel query");
                    }
                    match time::timeout(time::Duration::from_millis(100), query).await {
                        Ok(Ok((status, results))) => {
                            discard.check_idle(status);
                            results
                        }
                        Ok(Err(error)) => {
                            let db_error = match &error {
                                SqlOverHttpError::ConnectCompute(
                                    HttpConnError::ConnectionError(e),
                                )
                                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                                _ => None,
                            };
                            // if errored for some other reason, it might not be safe to return
                            if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
                                discard.discard();
                            }
                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                        }
                        Err(_timeout) => {
                            discard.discard();
                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                        }
                    }
                }
            }
        }
        Payload::Batch(statements) => {
-            info!("starting transaction");
+            if parsed_headers.txn_read_only {
-            let (inner, mut discard) = client.inner();
+                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
            let cancel_token = inner.cancel_token();
            let mut builder = inner.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
                builder = builder.isolation_level(isolation_level);
            }
-            if txn_read_only {
+            if parsed_headers.txn_deferrable {
-                builder = builder.read_only(true);
+                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
            }
-            if txn_deferrable {
+            if let Some(txn_isolation_level) = parsed_headers
-                builder = builder.deferrable(true);
+                .txn_isolation_level
-            }
+                .and_then(map_isolation_level_to_headers)
            let transaction = builder.start().await.map_err(|e| {
                // if we cannot start a transaction, we should return immediately
                // and not return to the pool. connection is clearly broken
                discard.discard();
                e
            })?;
            let results = match query_batch(
                cancel.child_token(),
                &transaction,
                statements,
                &mut size,
                raw_output,
                default_array_mode,
            )
            .await
            {
                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
            statements
                .process(cancel, &mut client, parsed_headers)
                .await?
        }
    };
    let metrics = client.metrics();
    // how could this possibly fail
    let body = serde_json::to_string(&result).expect("json serialization should not fail");
    let len = body.len();
    let response = response
        .body(Body::from(body))
        // only fails if invalid status code or invalid header/values are given.
        // these are not user configurable so it cannot fail dynamically
        .expect("building response payload should not fail");
    // count the egress bytes - we miss the TLS and header overhead but oh well...
    // moving this later in the stack is going to be a lot of effort and ehhhh
    metrics.record_egress(len as u64);
    HTTP_CONTENT_LENGTH
        .with_label_values(&["response"])
        .observe(len as f64);
    Ok(response)
 }
 impl QueryData {
    async fn process(
        self,
        cancel: CancellationToken,
        client: &mut Client<tokio_postgres::Client>,
        parsed_headers: HttpHeaders,
    ) -> Result<Value, SqlOverHttpError> {
        let (inner, mut discard) = client.inner();
        let cancel_token = inner.cancel_token();
        let res = match select(
            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
            pin!(cancel.cancelled()),
        )
        .await
        {
            // The query successfully completed.
            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
                discard.check_idle(status);
                Ok(results)
            }
            // The query failed with an error
            Either::Left((Err(e), __not_yet_cancelled)) => {
                discard.discard();
                return Err(e);
            }
            // The query was cancelled.
            Either::Right((_cancelled, query)) => {
                if let Err(err) = cancel_token.cancel_query(NoTls).await {
                    tracing::error!(?err, "could not cancel query");
                }
                // wait for the query cancellation
                match time::timeout(time::Duration::from_millis(100), query).await {
                    // query successed before it was cancelled.
                    Ok(Ok((status, results))) => {
                        discard.check_idle(status);
                        Ok(results)
                    }
                    // query failed or was cancelled.
                    Ok(Err(error)) => {
                        let db_error = match &error {
                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                            _ => None,
                        };
                        // if errored for some other reason, it might not be safe to return
                        if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
                            discard.discard();
                        }
                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
                    }
                    Err(_timeout) => {
                        discard.discard();
                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
                    }
                }
            }
        };
        res
    }
 }
 impl BatchQueryData {
    async fn process(
        self,
        cancel: CancellationToken,
        client: &mut Client<tokio_postgres::Client>,
        parsed_headers: HttpHeaders,
    ) -> Result<Value, SqlOverHttpError> {
        info!("starting transaction");
        let (inner, mut discard) = client.inner();
        let cancel_token = inner.cancel_token();
        let mut builder = inner.build_transaction();
        if let Some(isolation_level) = parsed_headers.txn_isolation_level {
            builder = builder.isolation_level(isolation_level);
        }
        if parsed_headers.txn_read_only {
            builder = builder.read_only(true);
        }
        if parsed_headers.txn_deferrable {
            builder = builder.deferrable(true);
        }
        let transaction = builder.start().await.map_err(|e| {
            // if we cannot start a transaction, we should return immediately
            // and not return to the pool. connection is clearly broken
            discard.discard();
            e
        })?;
        let results =
            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                Ok(results) => {
                    info!("commit");
                    let status = transaction.commit().await.map_err(|e| {
@@ -659,44 +738,15 @@ async fn handle_inner(
                }
            };
-            if txn_read_only {
+        Ok(json!({ "results": results }))
-                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
+    }
            }
            if txn_deferrable {
                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
            }
            if let Some(txn_isolation_level) = txn_isolation_level_raw {
                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
            json!({ "results": results })
        }
    };
    let metrics = client.metrics();
    // how could this possibly fail
    let body = serde_json::to_string(&result).expect("json serialization should not fail");
    let len = body.len();
    let response = response
        .body(Body::from(body))
        // only fails if invalid status code or invalid header/values are given.
        // these are not user configurable so it cannot fail dynamically
        .expect("building response payload should not fail");
    // count the egress bytes - we miss the TLS and header overhead but oh well...
    // moving this later in the stack is going to be a lot of effort and ehhhh
    metrics.record_egress(len as u64);
    Ok(response)
 }
 async fn query_batch(
    cancel: CancellationToken,
    transaction: &Transaction<'_>,
    queries: BatchQueryData,
-    total_size: &mut usize,
+    parsed_headers: HttpHeaders,
    raw_output: bool,
    array_mode: bool,
 ) -> Result<Vec<Value>, SqlOverHttpError> {
    let mut results = Vec::with_capacity(queries.queries.len());
    let mut current_size = 0;
@@ -705,8 +755,7 @@ async fn query_batch(
            transaction,
            stmt,
            &mut current_size,
-            raw_output,
+            parsed_headers,
            array_mode
        ));
        let cancelled = pin!(cancel.cancelled());
        let res = select(query, cancelled).await;
@@ -723,7 +772,6 @@ async fn query_batch(
            }
        }
    }
    *total_size += current_size;
    Ok(results)
 }
@@ -731,8 +779,7 @@ async fn query_to_json<T: GenericClient>(
    client: &T,
    data: QueryData,
    current_size: &mut usize,
-    raw_output: bool,
+    parsed_headers: HttpHeaders,
    default_array_mode: bool,
 ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
    info!("executing query");
    let query_params = data.params;
@@ -792,12 +839,12 @@ async fn query_to_json<T: GenericClient>(
        columns.push(client.get_type(c.type_oid()).await?);
    }
-    let array_mode = data.array_mode.unwrap_or(default_array_mode);
+    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;
    // resulting JSON format is based on the format of node-postgres result
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,20 +1,35 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
+use crate::{
-use chrono::{DateTime, Utc};
+    config::{MetricBackupCollectionConfig, MetricCollectionConfig},
    context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    http,
    intern::{BranchIdInt, EndpointIdInt},
 };
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Bytes;
 use chrono::{DateTime, Datelike, Timelike, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
 use futures::future::select;
 use once_cell::sync::Lazy;
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use serde::{Deserialize, Serialize};
 use std::{
    convert::Infallible,
    pin::pin,
    sync::{
        atomic::{AtomicU64, AtomicUsize, Ordering},
        Arc,
    },
    time::Duration,
 };
 use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, trace};
 use utils::backoff;
 use uuid::{NoContext, Timestamp};
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: EndpointId,
+    pub endpoint_id: EndpointIdInt,
-    pub branch_id: BranchId,
+    pub branch_id: BranchIdInt,
 }
 pub trait MetricCounterRecorder {
    /// Record that some bytes were sent from the proxy to the client
    fn record_egress(&self, bytes: u64);
    /// Record that some connections were opened
    fn record_connection(&self, count: usize);
 }
 trait MetricCounterReporter {
    fn get_metrics(&mut self) -> (u64, usize);
    fn move_metrics(&self) -> (u64, usize);
 }
 #[derive(Debug)]
 struct MetricBackupCounter {
    transmitted: AtomicU64,
    opened_connections: AtomicUsize,
 }
 impl MetricCounterRecorder for MetricBackupCounter {
    fn record_egress(&self, bytes: u64) {
        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
    }
    fn record_connection(&self, count: usize) {
        self.opened_connections.fetch_add(count, Ordering::AcqRel);
    }
 }
 impl MetricCounterReporter for MetricBackupCounter {
    fn get_metrics(&mut self) -> (u64, usize) {
        (
            *self.transmitted.get_mut(),
            *self.opened_connections.get_mut(),
        )
    }
    fn move_metrics(&self) -> (u64, usize) {
        (
            self.transmitted.swap(0, Ordering::AcqRel),
            self.opened_connections.swap(0, Ordering::AcqRel),
        )
    }
 }
 #[derive(Debug)]
 pub struct MetricCounter {
    transmitted: AtomicU64,
    opened_connections: AtomicUsize,
    backup: Arc<MetricBackupCounter>,
 }
-impl MetricCounter {
+impl MetricCounterRecorder for MetricCounter {
    /// Record that some bytes were sent from the proxy to the client
-    pub fn record_egress(&self, bytes: u64) {
+    fn record_egress(&self, bytes: u64) {
        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
        self.backup.record_egress(bytes);
    }
    /// Record that some connections were opened
    fn record_connection(&self, count: usize) {
        self.opened_connections.fetch_add(count, Ordering::AcqRel);
        self.backup.record_connection(count);
    }
 }
 impl MetricCounterReporter for MetricCounter {
    fn get_metrics(&mut self) -> (u64, usize) {
        (
            *self.transmitted.get_mut(),
            *self.opened_connections.get_mut(),
        )
    }
    fn move_metrics(&self) -> (u64, usize) {
        (
            self.transmitted.swap(0, Ordering::AcqRel),
            self.opened_connections.swap(0, Ordering::AcqRel),
        )
    }
 }
 trait Clearable {
    /// extract the value that should be reported
    fn should_report(self: &Arc<Self>) -> Option<u64>;
    /// Determine whether the counter should be cleared from the global map.
    fn should_clear(self: &mut Arc<Self>) -> bool;
 }
 impl<C: MetricCounterReporter> Clearable for C {
    fn should_report(self: &Arc<Self>) -> Option<u64> {
        // heuristic to see if the branch is still open
        // if a clone happens while we are observing, the heuristic will be incorrect.
@@ -54,13 +143,12 @@ impl MetricCounter {
        // However, for the strong count to be 1 it must have occurred that at one instant
        // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
        let is_open = Arc::strong_count(self) > 1;
        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
        // update cached metrics eagerly, even if they can't get sent
        // (to avoid sending the same metrics twice)
        // see the relevant discussion on why to do so even if the status is not success:
        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        let value = self.transmitted.swap(0, Ordering::AcqRel);
+        let (value, opened) = self.move_metrics();
        // Our only requirement is that we report in every interval if there was an open connection
        // if there were no opened connections since, then we don't need to report
@@ -70,15 +158,12 @@ impl MetricCounter {
            Some(value)
        }
    }
    /// Determine whether the counter should be cleared from the global map.
    ///
    /// Returns `false` while any other handle to the counter exists; otherwise returns
    /// `true` only when both the transmitted-bytes and opened-connections totals are zero.
    fn should_clear(self: &mut Arc<Self>) -> bool {
        // we can't clear this entry if it's acquired elsewhere
        let Some(counter) = Arc::get_mut(self) else {
            return false;
        };
-        let opened = *counter.opened_connections.get_mut();
        // `get_metrics` yields `(transmitted, opened_connections)`, in that order
+        let (value, opened) = counter.get_metrics();
        let value = *counter.transmitted.get_mut();
        // clear if there's no data to report
        value == 0 && opened == 0
    }
@@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
 /// Registry of usage counters, keyed by `Ids`.
 #[derive(Default)]
 pub struct Metrics {
    // primary counters; `task_main` hands this map to `collect_metrics_iteration`
    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
    // backup counters; `task_backup` hands this map to `collect_metrics_backup_iteration`
    backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
 }
 impl Metrics {
    /// Register a new byte metrics counter for this endpoint
    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
        let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
            entry.clone()
        } else {
            self.backup_endpoints
                .entry(ids.clone())
                .or_insert_with(|| {
                    Arc::new(MetricBackupCounter {
                        transmitted: AtomicU64::new(0),
                        opened_connections: AtomicUsize::new(0),
                    })
                })
                .clone()
        };
        let entry = if let Some(entry) = self.endpoints.get(&ids) {
            entry.clone()
        } else {
@@ -104,12 +204,13 @@ impl Metrics {
                    Arc::new(MetricCounter {
                        transmitted: AtomicU64::new(0),
                        opened_connections: AtomicUsize::new(0),
                        backup: backup.clone(),
                    })
                })
                .clone()
        };
-        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
+        entry.record_connection(1);
        entry
    }
 }
@@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
        let now = Utc::now();
        collect_metrics_iteration(
-            &USAGE_METRICS,
+            &USAGE_METRICS.endpoints,
            &http_client,
            &config.endpoint,
            &hostname,
@@ -144,24 +245,12 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
    }
 }
-#[instrument(skip_all)]
+fn collect_and_clear_metrics<C: Clearable>(
-async fn collect_metrics_iteration(
+    endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
-    metrics: &Metrics,
+) -> Vec<(Ids, u64)> {
    client: &http::ClientWithMiddleware,
    metric_collection_endpoint: &reqwest::Url,
    hostname: &str,
    prev: DateTime<Utc>,
    now: DateTime<Utc>,
 ) {
    info!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
    );
    let mut metrics_to_clear = Vec::new();
-    let metrics_to_send: Vec<(Ids, u64)> = metrics
+    let metrics_to_send: Vec<(Ids, u64)> = endpoints
        .endpoints
        .iter()
        .filter_map(|counter| {
            let key = counter.key().clone();
@@ -173,33 +262,71 @@ async fn collect_metrics_iteration(
        })
        .collect();
    for metric in metrics_to_clear {
        match endpoints.entry(metric) {
            Entry::Occupied(mut counter) => {
                if counter.get_mut().should_clear() {
                    counter.remove_entry();
                }
            }
            Entry::Vacant(_) => {}
        }
    }
    metrics_to_send
 }
 /// Lazily group the collected `(ids, value)` pairs into `EventChunk`s holding at most
 /// `chunk_size` events each.
 ///
 /// Every event is an incremental `PROXY_IO_BYTES_PER_CLIENT` sample for the window
 /// `prev..now`, with an idempotency key obtained from `idempotency_key(hostname)`.
 /// A `chunk_size` of zero is treated as one, because `slice::chunks` panics on zero.
 fn create_event_chunks<'a>(
    metrics_to_send: &'a [(Ids, u64)],
    hostname: &'a str,
    prev: DateTime<Utc>,
    now: DateTime<Utc>,
    chunk_size: usize,
 ) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
    // Split into chunks of at most `chunk_size` metrics to avoid exceeding the max request size.
    // `chunks(0)` would panic, so a misconfigured zero falls back to one event per chunk.
    let chunk_size = chunk_size.max(1);
    metrics_to_send
        .chunks(chunk_size)
        .map(move |chunk| EventChunk {
            events: chunk
                .iter()
                .map(|(ids, value)| Event {
                    kind: EventType::Incremental {
                        start_time: prev,
                        stop_time: now,
                    },
                    metric: PROXY_IO_BYTES_PER_CLIENT,
                    idempotency_key: idempotency_key(hostname),
                    value: *value,
                    extra: ids.clone(),
                })
                .collect(),
        })
 }
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
    endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
    client: &http::ClientWithMiddleware,
    metric_collection_endpoint: &reqwest::Url,
    hostname: &str,
    prev: DateTime<Utc>,
    now: DateTime<Utc>,
 ) {
    info!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
    );
    let metrics_to_send = collect_and_clear_metrics(endpoints);
    if metrics_to_send.is_empty() {
        trace!("no new metrics to send");
    }
    // Send metrics.
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
+    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
        let events = chunk
            .iter()
            .map(|(ids, value)| Event {
                kind: EventType::Incremental {
                    start_time: prev,
                    stop_time: now,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
                idempotency_key: idempotency_key(hostname),
                value: *value,
                extra: Ids {
                    endpoint_id: ids.endpoint_id.clone(),
                    branch_id: ids.branch_id.clone(),
                },
            })
            .collect();
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&EventChunk { events })
+            .json(&chunk)
            .send()
            .await;
@@ -213,23 +340,142 @@ async fn collect_metrics_iteration(
        if !res.status().is_success() {
            error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
+            for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
                // Report if the metric value is suspiciously large
                error!("potentially abnormal metric value: {:?}", metric);
            }
        }
    }
 }
-    for metric in metrics_to_clear {
+pub async fn task_backup(
-        match metrics.endpoints.entry(metric) {
+    backup_config: &MetricBackupCollectionConfig,
-            Entry::Occupied(mut counter) => {
+    cancellation_token: CancellationToken,
-                if counter.get_mut().should_clear() {
+) -> anyhow::Result<()> {
-                    counter.remove_entry();
+    info!("metrics backup config: {backup_config:?}");
-                }
+    scopeguard::defer! {
-            }
+        info!("metrics backup has shut down");
-            Entry::Vacant(_) => {}
+    }
    // Even if the remote storage is not configured, we still want to clear the metrics.
    let storage = backup_config
        .remote_storage_config
        .as_ref()
        .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
        .transpose()?;
    let mut ticker = tokio::time::interval(backup_config.interval);
    // start of the first reporting window; advanced to `now` after every iteration
    let mut prev = Utc::now();
    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
    loop {
        // wake up on the next tick or on cancellation, whichever completes first
        select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
        let now = Utc::now();
        collect_metrics_backup_iteration(
            &USAGE_METRICS.backup_endpoints,
            &storage,
            &hostname,
            prev,
            now,
            backup_config.chunk_size,
        )
        .await;
        prev = now;
        // checked only after the iteration above, so one last collection runs on cancellation
        if cancellation_token.is_cancelled() {
            info!("metrics backup has been cancelled");
            break;
        }
    }
    Ok(())
 }
 /// Collect the backup counters and upload them to remote storage, one object per chunk
 /// of at most `chunk_size` events.
 ///
 /// Object paths are built from the date and time of `now` plus a per-chunk UUIDv7.
 /// A chunk whose path cannot be built or whose upload fails is logged and the loop
 /// moves on to the next chunk.
 #[instrument(skip_all)]
 async fn collect_metrics_backup_iteration(
    endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
    storage: &Option<GenericRemoteStorage>,
    hostname: &str,
    prev: DateTime<Utc>,
    now: DateTime<Utc>,
    chunk_size: usize,
 ) {
    let year = now.year();
    let month = now.month();
    let day = now.day();
    let hour = now.hour();
    let minute = now.minute();
    let second = now.second();
    let cancel = CancellationToken::new();
    info!("starting collect_metrics_backup_iteration");
    let metrics_to_send = collect_and_clear_metrics(endpoints);
    if metrics_to_send.is_empty() {
        trace!("no new metrics to send");
    }
    // Send metrics.
    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
        let real_now = Utc::now();
        // UUIDv7 embeds a Unix timestamp: pass whole seconds since the epoch, not the
        // 0..=59 second-of-minute that `second()` returns.
        let unix_seconds = u64::try_from(real_now.timestamp()).unwrap_or_default();
        let id = uuid::Uuid::new_v7(Timestamp::from_unix(
            NoContext,
            unix_seconds,
            real_now.nanosecond(),
        ));
        let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
        let remote_path = match RemotePath::from_string(&path) {
            Ok(remote_path) => remote_path,
            Err(e) => {
                error!("failed to create remote path from str {path}: {:?}", e);
                continue;
            }
        };
        let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
        if let Err(e) = res {
            error!(
                "failed to upload consumption events to remote storage: {:?}",
                e
            );
        }
    }
 }
 /// Serialize one chunk of events to JSON, gzip it and upload it to `remote_path`.
 ///
 /// Returns `Ok(())` without uploading when no remote storage is configured (an error
 /// is logged). Serialization, compression and upload failures are returned with
 /// context; the upload itself goes through `backoff::retry`.
 async fn upload_events_chunk(
    storage: &Option<GenericRemoteStorage>,
    chunk: EventChunk<'_, Event<Ids, &'static str>>,
    remote_path: &RemotePath,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let Some(storage) = storage else {
        error!("no remote storage configured");
        return Ok(());
    };
    let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
    let mut encoder = GzipEncoder::new(Vec::new());
    encoder.write_all(&data).await.context("compress metrics")?;
    encoder.shutdown().await.context("compress metrics")?;
    // The encoder is finished after `shutdown`, so take its buffer instead of cloning it.
    let compressed_data: Bytes = encoder.into_inner().into();
    backoff::retry(
        || async {
            // `Bytes` clones are cheap, so every retry gets a fresh one-item stream
            let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
            storage
                .upload(stream, compressed_data.len(), remote_path, None, cancel)
                .await
        },
        TimeoutOrCancel::caused_by_cancel,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_UPLOAD_MAX_RETRIES,
        "request_data_upload",
        cancel,
    )
    .await
    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
    .context("request_data_upload")?;
    Ok(())
 }
 #[cfg(test)]
@@ -248,8 +494,8 @@ mod tests {
    };
    use url::Url;
-    use super::{collect_metrics_iteration, Ids, Metrics};
+    use super::*;
-    use crate::{http, rate_limiter::RateLimiterConfig};
+    use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
    #[tokio::test]
    async fn metrics() {
@@ -284,18 +530,19 @@ mod tests {
        let now = Utc::now();
        // no counters have been registered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert!(r.is_empty());
        // register a new counter
        let counter = metrics.register(Ids {
-            endpoint_id: "e1".into(),
+            endpoint_id: (&EndpointId::from("e1")).into(),
-            branch_id: "b1".into(),
+            branch_id: (&BranchId::from("b1")).into(),
        });
        // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
@@ -305,7 +552,7 @@ mod tests {
        counter.record_egress(1);
        // egress should be observed
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
@@ -315,11 +562,19 @@ mod tests {
        drop(counter);
        // we do not observe the counter
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert!(r.is_empty());
        // counter is unregistered
        assert!(metrics.endpoints.is_empty());
        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
            .await;
        assert!(!metrics.backup_endpoints.is_empty());
        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
            .await;
        // backup counter is unregistered after the second iteration
        assert!(metrics.backup_endpoints.is_empty());
    }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,4 +94,5 @@ select = [
    "I", # isort
    "W", # pycodestyle
    "B", # bugbear
    "UP032", # f-string
 ]
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -33,6 +33,7 @@ once_cell.workspace = true
 parking_lot.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -28,7 +28,7 @@ use utils::pid_file;
 use metrics::set_build_info_metric;
 use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
-    DEFAULT_PG_LISTEN_ADDR,
+    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
@@ -170,6 +170,13 @@ struct Args {
    /// still needed for existing replication connection.
    #[arg(long)]
    walsenders_keep_horizon: bool,
    /// Enable partial backup. If disabled, safekeeper will not upload partial
    /// segments to remote storage.
    #[arg(long)]
    partial_backup_enabled: bool,
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
 }
 // Like PathBufValueParser, but allows empty string.
@@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> {
        http_auth,
        current_thread_runtime: args.current_thread_runtime,
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_enabled: args.partial_backup_enabled,
        partial_backup_timeout: args.partial_backup_timeout,
    };
    // initialize sentry if SENTRY_DSN is provided
@@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
    wal_backup::init_remote_storage(&conf);
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
 use crate::SafeKeeperConf;
 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 7;
+pub const SK_FORMAT_VERSION: u32 = 8;
 // contains persistent metadata for safekeeper
 const CONTROL_FILE_NAME: &str = "safekeeper.control";
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -2,6 +2,7 @@
 use crate::{
    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
    state::{PersistedPeers, TimelinePersistentState},
    wal_backup_partial,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
@@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 {
    pub peers: PersistedPeers,
 }
 /// Layout of the persistent timeline state in control file version 7.
 /// `upgrade_control_file` deserializes it and copies every field into
 /// `TimelinePersistentState`, filling `partial_backup` with its default.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct SafeKeeperStateV7 {
    #[serde(with = "hex")]
    pub tenant_id: TenantId,
    #[serde(with = "hex")]
    pub timeline_id: TimelineId,
    /// persistent acceptor state
    pub acceptor_state: AcceptorState,
    /// information about server
    pub server: ServerInfo,
    /// Unique id of the last *elected* proposer we dealt with. Not needed
    /// for correctness, exists for monitoring purposes.
    #[serde(with = "hex")]
    pub proposer_uuid: PgUuid,
    /// Since which LSN this timeline generally starts. Safekeeper might have
    /// joined later.
    pub timeline_start_lsn: Lsn,
    /// Since which LSN safekeeper has (had) WAL for this timeline.
    /// All WAL segments next to one containing local_start_lsn are
    /// filled with data from the beginning.
    pub local_start_lsn: Lsn,
    /// Part of WAL acknowledged by quorum *and available locally*. Always points
    /// to record boundary.
    pub commit_lsn: Lsn,
    /// LSN that points to the end of the last backed up segment. Useful to
    /// persist to avoid finding out offloading progress on boot.
    pub backup_lsn: Lsn,
    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
    /// of last record streamed to everyone). Persisting it helps skipping
    /// recovery in walproposer, generally we compute it from peers. In
    /// walproposer proto called 'truncate_lsn'. Updates are currently driven
    /// only by walproposer.
    pub peer_horizon_lsn: Lsn,
    /// LSN of the oldest known checkpoint made by pageserver and successfully
    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
    /// informational purposes, we receive it from pageserver (or broker).
    pub remote_consistent_lsn: Lsn,
    // Peers and their state as we remember it. Knowing peers themselves is
    // fundamental; but state is saved here only for informational purposes and
    // obviously can be stale. (Currently not saved at all, but let's provision
    // place to have less file version upgrades).
    pub peers: PersistedPeers,
 }
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
    // migrate to storing full term history
    if version == 1 {
@@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -190,6 +236,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -213,6 +260,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -236,6 +284,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.peer_horizon_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
@@ -262,7 +311,30 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
        oldstate.server.pg_version = 140005;
        return Ok(oldstate);
    } else if version == 7 {
        info!("reading safekeeper control file version {}", version);
        let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?;
        return Ok(TimelinePersistentState {
            tenant_id: oldstate.tenant_id,
            timeline_id: oldstate.timeline_id,
            acceptor_state: oldstate.acceptor_state,
            server: oldstate.server,
            proposer_uuid: oldstate.proposer_uuid,
            timeline_start_lsn: oldstate.timeline_start_lsn,
            local_start_lsn: oldstate.local_start_lsn,
            commit_lsn: oldstate.commit_lsn,
            backup_lsn: oldstate.backup_lsn,
            peer_horizon_lsn: oldstate.peer_horizon_lsn,
            remote_consistent_lsn: oldstate.remote_consistent_lsn,
            peers: oldstate.peers,
            partial_backup: wal_backup_partial::State::default(),
        });
    }
    // TODO: persist the file back to the disk after upgrade
    // TODO: think about backward compatibility and rollbacks
    bail!("unsupported safekeeper control file version {}", version)
 }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -32,6 +32,7 @@ pub mod send_wal;
 pub mod state;
 pub mod timeline;
 pub mod wal_backup;
 pub mod wal_backup_partial;
 pub mod wal_service;
 pub mod wal_storage;
@@ -48,6 +49,7 @@ pub mod defaults {
    pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
    pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
 }
 #[derive(Debug, Clone)]
@@ -79,6 +81,8 @@ pub struct SafeKeeperConf {
    pub http_auth: Option<Arc<SwappableJwtAuth>>,
    pub current_thread_runtime: bool,
    pub walsenders_keep_horizon: bool,
    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
 }
 impl SafeKeeperConf {
@@ -123,6 +127,8 @@ impl SafeKeeperConf {
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
            walsenders_keep_horizon: false,
            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
        }
    }
 }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
 });
 /// Counter of partial backup uploads, split by the `result` label.
 /// Registration happens on first access and panics if it fails.
 pub static PARTIAL_BACKUP_UPLOADS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "safekeeper_partial_backup_uploads_total",
        "Number of partial backup uploads to the S3",
        &["result"]
    )
    .expect("Failed to register safekeeper_partial_backup_uploads_total counter")
 });
 /// Counter of bytes uploaded during partial backup.
 /// Registration happens on first access and panics if it fails.
 pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "safekeeper_partial_backup_uploaded_bytes_total",
        "Number of bytes uploaded to the S3 during partial backup"
    )
    .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
 });
 pub const LABEL_UNKNOWN: &str = "unknown";
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -1221,6 +1221,7 @@ mod tests {
                    commit_lsn: Lsn(1234567600),
                },
            )]),
            partial_backup: crate::wal_backup_partial::State::default(),
        };
        let ser = state.ser().unwrap();
@@ -1266,6 +1267,8 @@ mod tests {
            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            // partial_backup
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        assert_eq!(Hex(&ser), Hex(&expected));
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -13,6 +13,7 @@ use utils::{
 use crate::{
    control_file,
    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
    wal_backup_partial::{self},
 };
 /// Persistent information stored on safekeeper node about timeline.
@@ -54,11 +55,14 @@ pub struct TimelinePersistentState {
    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
    /// informational purposes, we receive it from pageserver (or broker).
    pub remote_consistent_lsn: Lsn,
-    // Peers and their state as we remember it. Knowing peers themselves is
+    /// Peers and their state as we remember it. Knowing peers themselves is
-    // fundamental; but state is saved here only for informational purposes and
+    /// fundamental; but state is saved here only for informational purposes and
-    // obviously can be stale. (Currently not saved at all, but let's provision
+    /// obviously can be stale. (Currently not saved at all, but let's provision
-    // place to have less file version upgrades).
+    /// place to have less file version upgrades).
    pub peers: PersistedPeers,
    /// Holds names of partial segments uploaded to remote storage. Used to
    /// clean up old objects without leaving garbage in remote storage.
    pub partial_backup: wal_backup_partial::State,
 }
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -93,6 +97,7 @@ impl TimelinePersistentState {
                    .map(|p| (*p, PersistedPeerInfo::new()))
                    .collect(),
            ),
            partial_backup: wal_backup_partial::State::default(),
        }
    }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, wal_storage};
+use crate::{debug_dump, wal_backup_partial, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};
 /// Things safekeeper should know about timeline state on peers.
@@ -503,6 +503,9 @@ impl Timeline {
        if conf.peer_recovery_enabled {
            tokio::spawn(recovery_main(self.clone(), conf.clone()));
        }
        if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
            tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
        }
    }
    /// Delete timeline from disk completely, by removing timeline directory.
@@ -667,8 +670,8 @@ impl Timeline {
            term_flush_lsn =
                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;
        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
        self.commit_lsn_watch_tx.send(commit_lsn)?;
        Ok(rmsg)
    }
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -18,7 +18,7 @@ use std::time::Duration;
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
 use tokio::fs::File;
 use tokio::select;
@@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
        .unwrap()
 }
 /// Initialize the global `REMOTE_STORAGE` from `conf.remote_storage`, if that has not
 /// happened yet. With no remote storage configured the stored value is `None`.
 /// Panics when the configured storage cannot be created.
 pub fn init_remote_storage(conf: &SafeKeeperConf) {
    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
    // dependencies to all tasks instead.
    REMOTE_STORAGE.get_or_init(|| {
        let storage_config = conf.remote_storage.as_ref()?;
        let storage = GenericRemoteStorage::from_config(storage_config)
            .expect("failed to create remote storage");
        Some(storage)
    });
 }
 const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
 /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main(
        conf.remote_storage
    );
    let conf_ = conf.clone();
    REMOTE_STORAGE.get_or_init(|| {
        conf_
            .remote_storage
            .as_ref()
            .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
    });
    // Presence in this map means launcher is aware s3 offloading is needed for
    // the timeline, but task is started only if it makes sense for to offload
    // from this safekeeper.
@@ -518,6 +520,35 @@ async fn backup_object(
        .await
 }
 /// Upload the first `size` bytes of `source_file` to `target_file` in the configured
 /// remote storage, tagging the object with `sk_type = partial_segment` metadata.
 ///
 /// Fails if the source file cannot be opened or if the upload fails.
 pub(crate) async fn backup_partial_segment(
    source_file: &Utf8Path,
    target_file: &RemotePath,
    size: usize,
 ) -> Result<()> {
    let remote_storage = get_configured_remote_storage();
    let segment = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
    // only the leading `size` bytes of the file are streamed
    let truncated = tokio::io::AsyncReadExt::take(segment, size as u64);
    let body = tokio_util::io::ReaderStream::with_capacity(truncated, BUFFER_SIZE);
    let cancel = CancellationToken::new();
    let metadata = StorageMetadata::from([("sk_type", "partial_segment")]);
    remote_storage
        .upload(body, size, target_file, Some(metadata), &cancel)
        .await
 }
 pub async fn read_object(
    file_path: &RemotePath,
    offset: u64,
@@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    Ok(())
 }
 /// Delete the given objects from the configured remote storage.
 /// Used by wal_backup_partial.
 pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
    // a fresh token that is never cancelled; the storage API just requires one
    let never_cancelled = CancellationToken::new();
    let remote_storage = get_configured_remote_storage();
    remote_storage
        .delete_objects(paths, &never_cancelled)
        .await
 }
 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
    wal_seg_size: usize,
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -0,0 +1,407 @@
 //! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
 //! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
 //! was changed), the segment will be uploaded to S3 in about 15 minutes.
 //!
 //! The filename format for partial segments is
 //! `Segment_Term_Flush_Commit_skNN.partial`, where:
 //! - `Segment` – the segment name, like `000000010000000000000001`
 //! - `Term` – current term
 //! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568`
 //! - `Commit` – commit_lsn in the same hex format
 //! - `NN` – safekeeper_id, like `1`
 //!
 //! The full object name example:
 //! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial`
 //!
 //! Each safekeeper will keep info about remote partial segments in its control
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
 use std::sync::Arc;
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use rand::Rng;
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 use tracing::{debug, error, info, instrument};
 use utils::lsn::Lsn;
 use crate::{
    metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
    safekeeper::Term,
    timeline::Timeline,
    wal_backup, SafeKeeperConf,
 };
 /// Lifecycle stage of a single remote partial segment object, stored in
 /// `PartialRemoteSegment::status`.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum UploadStatus {
    /// Upload is in progress. `prepare_upload` creates entries with this
    /// status, and `do_upload` records them before the upload starts.
    InProgress,
    /// Upload is finished. `gc` keeps only entries with this status.
    Uploaded,
    /// Deletion is in progress. `do_upload` assigns this status to all
    /// previously known segments once a newer version has been uploaded.
    Deleting,
 }
 /// Description of one partial segment object in remote storage.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct PartialRemoteSegment {
    /// Current lifecycle stage of the object.
    pub status: UploadStatus,
    /// Object file name (format is described in the module docs). It is joined
    /// with the remote prefix to build the full remote path.
    pub name: String,
    /// `commit_lsn` observed when the upload was prepared.
    pub commit_lsn: Lsn,
    /// `flush_lsn` observed when the upload was prepared; the segment is
    /// uploaded from its start up to this position.
    pub flush_lsn: Lsn,
    /// Term observed when the upload was prepared.
    pub term: Term,
 }
 impl PartialRemoteSegment {
    /// Returns `true` when both entries have the same name, LSNs and term.
    /// The `status` field is deliberately left out of the comparison.
    fn eq_without_status(&self, other: &Self) -> bool {
        let lhs = (&self.name, &self.commit_lsn, &self.flush_lsn, &self.term);
        let rhs = (
            &other.name,
            &other.commit_lsn,
            &other.flush_lsn,
            &other.term,
        );
        lhs == rhs
    }
 }
 // NB: these structures are a part of a control_file, you can't change them without
 // changing the control file format version.
 /// Partial backup state: the list of remote partial segments this safekeeper
 /// knows about. It is stored in the control file as `partial_backup`, and
 /// `PartialBackup` holds an in-memory copy of it.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
 pub struct State {
    /// All tracked remote partial segments, in the order they were added.
    pub segments: Vec<PartialRemoteSegment>,
 }
 impl State {
    /// Returns a copy of the first segment whose status is `Uploaded`, or
    /// `None` when there is no such segment. There should be only one Uploaded
    /// segment at a time.
    fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
        for candidate in &self.segments {
            if candidate.status == UploadStatus::Uploaded {
                return Some(candidate.clone());
            }
        }
        None
    }
 }
 /// Working context of the partial backup task for one timeline.
 struct PartialBackup {
    /// WAL segment size, used to map an LSN to a segment number and offset.
    wal_seg_size: usize,
    /// Timeline whose partial segment is backed up.
    tli: Arc<Timeline>,
    /// Safekeeper configuration (provides `my_id` for remote object names).
    conf: SafeKeeperConf,
    /// Local directory that contains the `.partial` segment file.
    local_prefix: Utf8PathBuf,
    /// Prefix under which objects are stored in remote storage.
    remote_prefix: Utf8PathBuf,
    /// In-memory copy of the partial backup state kept in the control file.
    state: State,
 }
 // Read-only methods for getting segment names
 impl PartialBackup {
    /// Segment number for `lsn`, computed with this timeline's WAL segment size.
    fn segno(&self, lsn: Lsn) -> XLogSegNo {
        let seg_size = self.wal_seg_size;
        lsn.segment_number(seg_size)
    }

    /// WAL file name of segment `segno` on timeline `PG_TLI`.
    fn segment_name(&self, segno: u64) -> String {
        let seg_size = self.wal_seg_size;
        XLogFileName(PG_TLI, segno, seg_size)
    }

    /// Name of the remote object, in the form
    /// `Segment_Term_Flush_Commit_skNN.partial` (LSNs as 16-digit upper-case hex).
    fn remote_segment_name(
        &self,
        segno: u64,
        term: u64,
        commit_lsn: Lsn,
        flush_lsn: Lsn,
    ) -> String {
        let segment = self.segment_name(segno);
        let flush = flush_lsn.0;
        let commit = commit_lsn.0;
        let sk_id = self.conf.my_id.0;
        format!(
            "{}_{}_{:016X}_{:016X}_sk{}.partial",
            segment, term, flush, commit, sk_id,
        )
    }

    /// Name of the local file that holds the partial segment `segno`.
    fn local_segment_name(&self, segno: u64) -> String {
        let mut file_name = self.segment_name(segno);
        file_name.push_str(".partial");
        file_name
    }
 }
 impl PartialBackup {
    /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded.
    ///
    /// The returned entry has status `InProgress` and a name derived from the
    /// current term, `flush_lsn`, `commit_lsn` and this safekeeper's id.
    async fn prepare_upload(&self) -> PartialRemoteSegment {
        // this operation takes a lock to get the actual state
        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
        let flush_lsn = Lsn(sk_info.flush_lsn);
        let commit_lsn = Lsn(sk_info.commit_lsn);
        let term = sk_info.term;
        let segno = self.segno(flush_lsn);
        let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn);
        PartialRemoteSegment {
            status: UploadStatus::InProgress,
            name,
            commit_lsn,
            flush_lsn,
            term,
        }
    }

    /// Reads segment from disk and uploads it to the remote storage.
    ///
    /// Uploads the bytes from the start of the segment up to `prepared.flush_lsn`.
    /// Returns an error if the upload fails or if the term changed while the
    /// upload was running. Panics if the current `commit_lsn` or `flush_lsn`
    /// is lower than the one recorded in `prepared`.
    async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> {
        let flush_lsn = prepared.flush_lsn;
        let segno = self.segno(flush_lsn);
        // We're going to backup bytes from the start of the segment up to flush_lsn.
        let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
        let local_path = self.local_prefix.join(self.local_segment_name(segno));
        let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
        // Upload first `backup_bytes` bytes of the segment to the remote storage.
        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
        PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
        // We uploaded the segment, now let's verify that the data is still actual.
        // If the term changed, we cannot guarantee the validity of the uploaded data.
        // If the term is the same, we know the data is not corrupted.
        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
        if sk_info.term != prepared.term {
            anyhow::bail!("term changed during upload");
        }
        assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
        assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
        Ok(())
    }

    /// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
    ///
    /// On divergence the in-memory state is replaced with the on-disk one before
    /// the error is returned, and `new_state` is not written.
    async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
        self.tli
            .map_control_file(|cf| {
                if cf.partial_backup != self.state {
                    let memory = self.state.clone();
                    self.state = cf.partial_backup.clone();
                    anyhow::bail!(
                        "partial backup state diverged, memory={:?}, disk={:?}",
                        memory,
                        cf.partial_backup
                    );
                }
                cf.partial_backup = new_state.clone();
                Ok(())
            })
            .await?;
        // update in-memory state
        self.state = new_state;
        Ok(())
    }

    /// Upload the latest version of the partial segment and garbage collect older versions.
    ///
    /// Steps: record `prepared` in the state, upload it, then record it as
    /// `Uploaded` while marking every previously known segment as `Deleting`,
    /// and finally run `gc`. An error at any step aborts the remaining steps.
    #[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
    async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
        info!("starting upload {:?}", prepared);
        let state_0 = self.state.clone();
        let state_1 = {
            let mut state = state_0.clone();
            state.segments.push(prepared.clone());
            state
        };
        // we're going to upload a new segment, let's write it to disk to make GC later
        self.commit_state(state_1).await?;
        self.upload_segment(prepared.clone()).await?;
        let state_2 = {
            let mut state = state_0.clone();
            for seg in state.segments.iter_mut() {
                seg.status = UploadStatus::Deleting;
            }
            let mut actual_remote_segment = prepared.clone();
            actual_remote_segment.status = UploadStatus::Uploaded;
            state.segments.push(actual_remote_segment);
            state
        };
        // we've uploaded new segment, it's actual, all other segments should be GCed
        self.commit_state(state_2).await?;
        self.gc().await?;
        Ok(())
    }

    /// Delete all non-Uploaded segments from the remote storage. There should be only one
    /// Uploaded segment at a time.
    ///
    /// After the remote objects are removed, the state is rewritten so that it
    /// contains only the `Uploaded` entries. An object whose name matches an
    /// `Uploaded` entry is never deleted, even if a stale non-Uploaded entry
    /// with the same name is present in the state.
    #[instrument(name = "gc", skip_all)]
    async fn gc(&mut self) -> anyhow::Result<()> {
        let mut segments_to_delete = vec![];
        let new_segments: Vec<PartialRemoteSegment> = self
            .state
            .segments
            .iter()
            .filter_map(|seg| {
                if seg.status == UploadStatus::Uploaded {
                    Some(seg.clone())
                } else {
                    segments_to_delete.push(seg.name.clone());
                    None
                }
            })
            .collect();
        // A failed upload attempt leaves its entry in the state. If the retry
        // happens with the same term and LSNs, it produces the same object name,
        // so a stale non-Uploaded entry can point to the very object that was
        // just uploaded. Such an object must stay in remote storage; the stale
        // entry is still dropped from the state below.
        segments_to_delete.retain(|name| !new_segments.iter().any(|seg| &seg.name == name));
        info!("deleting objects: {:?}", segments_to_delete);
        let mut objects_to_delete = vec![];
        for seg in segments_to_delete.iter() {
            let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
            objects_to_delete.push(remote_path);
        }
        // removing segments from remote storage
        wal_backup::delete_objects(&objects_to_delete).await?;
        // now we can update the state on disk
        let new_state = {
            let mut state = self.state.clone();
            state.segments = new_segments;
            state
        };
        self.commit_state(new_state).await?;
        Ok(())
    }
 }
 /// Background task that uploads the partial (last, unfinished) WAL segment of
 /// one timeline to remote storage.
 ///
 /// The task loops forever: it waits until the term, `flush_lsn` or
 /// `commit_lsn` differs from the already uploaded segment, then waits for
 /// `conf.partial_backup_timeout` (restarting the wait if the segment number
 /// changes meanwhile), and finally uploads the current partial segment.
 /// Upload errors are logged and counted; the task then continues with the next
 /// iteration. The task returns when the timeline is cancelled or when the
 /// remote prefix cannot be derived from the timeline directory.
 #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
 pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;
    let mut cancellation_rx = match tli.get_cancellation_rx() {
        Ok(rx) => rx,
        Err(_) => {
            info!("timeline canceled during task start");
            return;
        }
    };
    // sleep for random time to avoid thundering herd
    {
        let randf64 = rand::thread_rng().gen_range(0.0..1.0);
        let sleep_duration = await_duration.mul_f64(randf64);
        // The sleep can last up to `await_duration`, so it has to react to
        // cancellation the same way as every other wait in this task.
        tokio::select! {
            _ = cancellation_rx.changed() => {
                info!("timeline canceled");
                return;
            }
            _ = tokio::time::sleep(sleep_duration) => {}
        }
    }
    let (_, persistent_state) = tli.get_state().await;
    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
    let wal_seg_size = tli.get_wal_seg_size().await;
    let local_prefix = tli.timeline_dir.clone();
    // Remote objects are stored under the timeline directory path relative to the workdir.
    let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
        Ok(path) => path.to_owned(),
        Err(e) => {
            error!("failed to strip workspace dir prefix: {:?}", e);
            return;
        }
    };
    let mut backup = PartialBackup {
        wal_seg_size,
        tli,
        state: persistent_state.partial_backup,
        conf,
        local_prefix,
        remote_prefix,
    };
    debug!("state: {:?}", backup.state);
    'outer: loop {
        // wait until we have something to upload
        let uploaded_segment = backup.state.uploaded_segment();
        if let Some(seg) = &uploaded_segment {
            // if we already uploaded something, wait until we have something new
            while flush_lsn_rx.borrow().lsn == seg.flush_lsn
                && *commit_lsn_rx.borrow() == seg.commit_lsn
                && flush_lsn_rx.borrow().term == seg.term
            {
                tokio::select! {
                    _ = cancellation_rx.changed() => {
                        info!("timeline canceled");
                        return;
                    }
                    _ = commit_lsn_rx.changed() => {}
                    _ = flush_lsn_rx.changed() => {}
                }
            }
        }
        // if we don't have any data and zero LSNs, wait for something
        while flush_lsn_rx.borrow().lsn == Lsn(0) {
            tokio::select! {
                _ = cancellation_rx.changed() => {
                    info!("timeline canceled");
                    return;
                }
                _ = flush_lsn_rx.changed() => {}
            }
        }
        // fixing the segno and waiting some time to prevent reuploading the same segment too often
        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
        let timeout = tokio::time::sleep(await_duration);
        tokio::pin!(timeout);
        let mut timeout_expired = false;
        // waiting until timeout expires OR segno changes
        'inner: loop {
            tokio::select! {
                _ = cancellation_rx.changed() => {
                    info!("timeline canceled");
                    return;
                }
                _ = commit_lsn_rx.changed() => {}
                _ = flush_lsn_rx.changed() => {
                    let segno = backup.segno(flush_lsn_rx.borrow().lsn);
                    if segno != pending_segno {
                        // previous segment is no longer partial, aborting the wait
                        break 'inner;
                    }
                }
                _ = &mut timeout => {
                    // timeout expired, now we are ready for upload
                    timeout_expired = true;
                    break 'inner;
                }
            }
        }
        if !timeout_expired {
            // likely segno has changed, let's try again in the next iteration
            continue 'outer;
        }
        let prepared = backup.prepare_upload().await;
        if let Some(seg) = &uploaded_segment {
            if seg.eq_without_status(&prepared) {
                // we already uploaded this segment, nothing to do
                continue 'outer;
            }
        }
        match backup.do_upload(&prepared).await {
            Ok(()) => {
                debug!(
                    "uploaded {} up to flush_lsn {}",
                    prepared.name, prepared.flush_lsn
                );
                PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
            }
            Err(e) => {
                info!("failed to upload {}: {:#}", prepared.name, e);
                PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
            }
        }
    }
 }
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        http_auth: None,
        current_thread_runtime: false,
        walsenders_keep_horizon: false,
        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
    };
    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    Returns basepath for files with captured output.
    """
    assert isinstance(cmd, list)
-    base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
+    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
    basepath = os.path.join(capture_dir, base)
    stdout_filename = basepath + ".stdout"
    stderr_filename = basepath + ".stderr"
    with open(stdout_filename, "w") as stdout_f:
        with open(stderr_filename, "w") as stderr_f:
-            print('(capturing output to "{}.stdout")'.format(base))
+            print(f'(capturing output to "{base}.stdout")')
            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
    return basepath
@@ -82,11 +82,9 @@ class PgBin:
    def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
        self.log_dir = log_dir
-        self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin")
+        self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
        self.env = os.environ.copy()
-        self.env["LD_LIBRARY_PATH"] = os.path.join(
+        self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
            str(pg_distrib_dir), "v{}".format(pg_version), "lib"
        )
    def _fixpath(self, command: List[str]):
        if "/" not in command[0]:
@@ -110,7 +108,7 @@ class PgBin:
        """
        self._fixpath(command)
-        print('Running command "{}"'.format(" ".join(command)))
+        print(f'Running command "{" ".join(command)}"')
        env = self._build_env(env)
        subprocess.run(command, env=env, cwd=cwd, check=True)
@@ -128,7 +126,7 @@ class PgBin:
        """
        self._fixpath(command)
-        print('Running command "{}"'.format(" ".join(command)))
+        print(f'Running command "{" ".join(command)}"')
        env = self._build_env(env)
        return subprocess_capture(
            str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
@@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session):
 def lsn_to_hex(num: int) -> str:
    """Convert lsn from int to standard hex notation."""
-    return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF)
+    return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
 def lsn_from_hex(lsn_hex: str) -> int:
@@ -331,16 +329,12 @@ def wait_for_upload(
        if current_lsn >= lsn:
            return
        print(
-            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
+            f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
                lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1
            )
        )
        time.sleep(1)
    raise Exception(
-        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
+        f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
            lsn_to_hex(lsn), lsn_to_hex(current_lsn)
        )
    )
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "attachment_service"
+name = "storage_controller"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -25,6 +25,7 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
 itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
@@ -44,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
-utils = { path = "../../libs/utils/" }
+utils = { path = "../libs/utils/" }
-metrics = { path = "../../libs/metrics/" }
+metrics = { path = "../libs/metrics/" }
-control_plane = { path = ".." }
+control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/Show More
+++ b/Show More