Merge pull request #7248 from neondatabase/rc/2024-03-26

Release 2024-03-26
Merge pull request #7219 from neondatabase/rc/2024-03-25
2026-08-03 03:10:38 +00:00 · 2024-03-26 15:17:00 +00:00 · 2024-03-25 12:28:09 +00:00 · 2024-03-19 12:07:14 +01:00 · 2024-03-18 16:28:17 +01:00 · 2024-03-18 13:01:17 +01:00
134 changed files with 1897 additions and 5697 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,16 +147,15 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+                                                   { "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
-          neonvm-captest-sharding-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
-            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -274,15 +270,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1127,7 +1127,6 @@ jobs:
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
-              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
@@ -1137,7 +1136,6 @@ jobs:
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
-              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1146,7 +1144,6 @@ jobs:
              -f deployProxy=true \
              -f deployStorage=false \
              -f deployStorageBroker=false \
-              -f deployStorageController=false \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -62,14 +62,14 @@ jobs:

  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, gen3, small ]
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
    steps:
      - name: check if ecr image are present
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
            fi
          done

-      - name: Set e2e-platforms
-        id: e2e-platforms
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          # Default set of platforms to run e2e tests on
-          platforms='["docker", "k8s"]'
-
-          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
-          # If the workflow run is not a pull request, add k8s-neonvm to the list.
-          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
-            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
-              case "$f" in
-                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
-                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
-                  ;;
-                *)
-                  # no-op
-                  ;;
-              esac
-            done
-          else
-            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
-          fi
-
-          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
-
      - name: Set PR's status to pending and request a remote CI test
-        env:
-          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+          # For pull requests, GH Actions sets the "github.sha" variable to point at a fake merge commit,
+          # but we need to use the real sha of the latest commit in the PR's branch for the e2e job,
+          # so that a job run status update can be placed on it later.
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable; pick the original sha value for those
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

-          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
-            --method POST \
-            --raw-field "state=pending" \
-            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
-            --raw-field "context=neon-cloud-e2e"
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"

-          gh workflow --repo ${REMOTE_REPO} \
-            run testing.yml \
-              --ref "main" \
-              --raw-field "ci_job_name=neon-cloud-e2e" \
-              --raw-field "commit_hash=$COMMIT_SHA" \
-              --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
-              --raw-field "storage_image_tag=${TAG}" \
-              --raw-field "compute_image_tag=${TAG}" \
-              --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
-              --raw-field "e2e-platforms=${E2E_PLATFORMS}"
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${TAG}\",
+                \"compute_image_tag\": \"${TAG}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+              }
+            }"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -288,7 +288,6 @@ dependencies = [
 "hex",
 "humantime",
 "hyper",
- "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -2235,9 +2234,9 @@ dependencies = [

 [[package]]
 name = "h2"
-version = "0.3.26"
+version = "0.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
+checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
 dependencies = [
 "bytes",
 "fnv",
@@ -3436,9 +3435,9 @@ dependencies = [

 [[package]]
 name = "ordered-multimap"
-version = "0.7.3"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
+checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
 dependencies = [
 "dlv-list",
 "hashbrown 0.14.0",
@@ -3582,7 +3581,6 @@ dependencies = [
 "strum_macros",
 "svg_fmt",
 "sync_wrapper",
- "sysinfo",
 "tenant_size_model",
 "thiserror",
 "tokio",
@@ -4200,7 +4198,6 @@ name = "proxy"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-compression",
 "async-trait",
 "aws-config",
 "aws-sdk-iam",
@@ -5623,26 +5620,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storcon_cli"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap",
- "comfy-table",
- "hyper",
- "pageserver_api",
- "pageserver_client",
- "reqwest",
- "serde",
- "serde_json",
- "thiserror",
- "tokio",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5956,9 +5933,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.37.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
 "backtrace",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,6 @@ members = [
    "compute_tools",
    "control_plane",
    "control_plane/attachment_service",
-    "control_plane/storcon_cli",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

-# Create remote extension download directory
-RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
-
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1262,12 +1262,10 @@ LIMIT 100",
        .await
        .map_err(DownloadError::Other);

-        if download_size.is_ok() {
-            self.ext_download_progress
-                .write()
-                .expect("bad lock")
-                .insert(ext_archive_name.to_string(), (download_start, true));
-        }
+        self.ext_download_progress
+            .write()
+            .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));

        download_size
    }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
+                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -743,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // which may happen in two cases:
    // - extension was just installed
    // - extension was already installed and is up to date
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    if let Err(e) = client.simple_query(query) {
-        error!(
-            "failed to upgrade neon extension during `handle_extension_neon`: {}",
-            e
-        );
-    }
+    // DISABLED due to compute node unpinning epic
+    // let query = "ALTER EXTENSION neon UPDATE";
+    // info!("update neon extension version with query: {}", query);
+    // client.simple_query(query)?;

    Ok(())
 }

 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade");
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade (not really)");
+    // DISABLED due to compute node unpinning epic
+    // let query = "ALTER EXTENSION neon UPDATE";
+    // info!("update neon extension version with query: {}", query);
+    // client.simple_query(query)?;

    Ok(())
 }
@@ -809,8 +806,19 @@ $$;"#,
        "",
        "",
        "",
-        "",
        // Add new migrations below.
+        r#"
+DO $$
+DECLARE
+    role_name TEXT;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
+    END LOOP;
+END
+$$;"#,
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -25,7 +25,6 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
-itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
@@ -1,3 +0,0 @@
-- This file should undo anything in `up.sql`
-
-ALTER TABLE tenant_shards drop scheduling_policy;
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
@@ -1,2 +0,0 @@
-
-ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -14,6 +14,7 @@ use utils::{

 use crate::service::Config;

+const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;
@@ -279,10 +280,11 @@ impl ComputeHook {
                Err(NotifyError::SlowDown)
            }
            StatusCode::LOCKED => {
-                // We consider this fatal, because it's possible that the operation blocking the control one is
-                // also the one that is waiting for this reconcile.  We should let the reconciler calling
-                // this hook fail, to give control plane a chance to un-lock.
-                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
+                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
+                // is not appropriate
+                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
                Err(NotifyError::Busy)
            }
            StatusCode::SERVICE_UNAVAILABLE
@@ -304,12 +306,7 @@ impl ComputeHook {
        let client = reqwest::Client::new();
        backoff::retry(
            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| {
-                matches!(
-                    e,
-                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
-                )
-            },
+            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
            3,
            10,
            "Send compute notification",
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -34,8 +34,7 @@ use utils::{
 };

 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
-    TenantShardMigrateRequest,
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

@@ -399,15 +398,6 @@ async fn handle_tenant_describe(
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }

-async fn handle_tenant_list(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    json_response(StatusCode::OK, service.tenant_list())
-}
-
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -421,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    check_permissions(&req, Scope::Admin)?;

    let state = get_state(&req);
-    let nodes = state.service.node_list().await?;
-    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
-
-    json_response(StatusCode::OK, api_nodes)
+    json_response(StatusCode::OK, state.service.node_list().await?)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -491,22 +478,6 @@ async fn handle_tenant_shard_migrate(
    )
 }

-async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .tenant_update_policy(tenant_id, update_req)
-            .await?,
-    )
-}
-
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
@@ -538,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.consistency_check().await?)
 }

-async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
-}
-
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
@@ -763,9 +726,6 @@ pub fn make_router(
                RequestName("debug_v1_consistency_check"),
            )
        })
-        .post("/debug/v1/reconcile_all", |r| {
-            request_span(r, handle_reconcile_all)
-        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
@@ -805,16 +765,6 @@ pub fn make_router(
                RequestName("control_v1_tenant_describe"),
            )
        })
-        .get("/control/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
-        })
-        .put("/control/v1/tenant/:tenant_id/policy", |r| {
-            named_request_span(
-                r,
-                handle_tenant_update_policy,
-                RequestName("control_v1_tenant_policy"),
-            )
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -37,9 +37,6 @@ pub(crate) struct StorageControllerMetricGroup {
    pub(crate) storage_controller_reconcile_complete:
        measured::CounterVec<ReconcileCompleteLabelGroupSet>,

-    /// Count of how many times we make an optimization change to a tenant's scheduling
-    pub(crate) storage_controller_schedule_optimization: measured::Counter,
-
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
@@ -104,7 +101,6 @@ impl StorageControllerMetricGroup {
                    status: StaticLabelSet::new(),
                },
            ),
-            storage_controller_schedule_optimization: measured::Counter::new(),
            storage_controller_http_request_status: measured::CounterVec::new(
                HttpRequestStatusLabelGroupSet {
                    path: lasso::ThreadedRodeo::new(),
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
 use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
-        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantLocateResponseShard,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
    },
    shard::TenantShardId,
 };
@@ -257,19 +256,6 @@ impl Node {
        )
        .await
    }
-
-    /// Generate the simplified API-friendly description of a node's state
-    pub(crate) fn describe(&self) -> NodeDescribeResponse {
-        NodeDescribeResponse {
-            id: self.id,
-            availability: self.availability.into(),
-            scheduling: self.scheduling,
-            listen_http_addr: self.listen_http_addr.clone(),
-            listen_http_port: self.listen_http_port,
-            listen_pg_addr: self.listen_pg_addr.clone(),
-            listen_pg_port: self.listen_pg_port,
-        }
-    }
 }

 impl std::fmt::Display for Node {
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
-use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::ShardConfigError;
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {

 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

-/// Some methods can operate on either a whole tenant or a single shard
-pub(crate) enum TenantFilter {
-    Tenant(TenantId),
-    Shard(TenantShardId),
-}
-
 impl Persistence {
    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -147,7 +140,7 @@ impl Persistence {
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let latency = &METRICS_REGISTRY
@@ -175,7 +168,7 @@ impl Persistence {
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let mut conn = self.connection_pool.get()?;
@@ -282,11 +275,6 @@ impl Persistence {
                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
                shard.placement_policy = "{\"Attached\":0}".to_string();
            }
-
-            if shard.scheduling_policy.is_empty() {
-                shard.scheduling_policy =
-                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
-            }
        }

        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -477,45 +465,59 @@ impl Persistence {
    /// that we only do the first time a tenant is set to an attached policy via /location_config.
    pub(crate) async fn update_tenant_shard(
        &self,
-        tenant: TenantFilter,
-        input_placement_policy: Option<PlacementPolicy>,
-        input_config: Option<TenantConfig>,
+        tenant_shard_id: TenantShardId,
+        input_placement_policy: PlacementPolicy,
+        input_config: TenantConfig,
        input_generation: Option<Generation>,
-        input_scheduling_policy: Option<ShardSchedulingPolicy>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
-            let query = match tenant {
-                TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .into_boxed(),
-                TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(input_tenant_id.to_string()))
-                    .into_boxed(),
-            };
+            let query = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));

-            #[derive(AsChangeset)]
-            #[diesel(table_name = crate::schema::tenant_shards)]
-            struct ShardUpdate {
-                generation: Option<i32>,
-                placement_policy: Option<String>,
-                config: Option<String>,
-                scheduling_policy: Option<String>,
+            if let Some(input_generation) = input_generation {
+                // Update includes generation column
+                query
+                    .set((
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            } else {
+                // Update does not include generation column
+                query
+                    .set((
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
            }

-            let update = ShardUpdate {
-                generation: input_generation.map(|g| g.into().unwrap() as i32),
-                placement_policy: input_placement_policy
-                    .map(|p| serde_json::to_string(&p).unwrap()),
-                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
-                scheduling_policy: input_scheduling_policy
-                    .map(|p| serde_json::to_string(&p).unwrap()),
-            };
+            Ok(())
+        })
+        .await?;

-            query.set(update).execute(conn)?;
+        Ok(())
+    }
+
+    pub(crate) async fn update_tenant_config(
+        &self,
+        input_tenant_id: TenantId,
+        input_config: TenantConfig,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
+            diesel::update(tenant_shards)
+                .filter(tenant_id.eq(input_tenant_id.to_string()))
+                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
+                .execute(conn)?;

            Ok(())
        })
@@ -726,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) splitting: SplitState,
    #[serde(default)]
    pub(crate) config: String,
-    #[serde(default)]
-    pub(crate) scheduling_policy: String,
 }

 impl TenantShardPersistence {
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -487,7 +487,6 @@ impl Reconciler {
        while let Err(e) = self.compute_notify().await {
            match e {
                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
-                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
                _ => {
                    tracing::warn!(
                        "Live migration blocked by compute notification error, retrying: {e}"
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -58,70 +58,6 @@ pub(crate) struct Scheduler {
    nodes: HashMap<NodeId, SchedulerNode>,
 }

-/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
-///
-/// For example, we may set an affinity score based on the number of shards from the same
-/// tenant already on a node, to implicitly prefer to balance out shards.
-#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
-pub(crate) struct AffinityScore(pub(crate) usize);
-
-impl AffinityScore {
-    /// If we have no anti-affinity at all toward a node, this is its score.  It means
-    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
-    /// based on other information such as total utilization.
-    pub(crate) const FREE: Self = Self(0);
-
-    pub(crate) fn inc(&mut self) {
-        self.0 += 1;
-    }
-}
-
-impl std::ops::Add for AffinityScore {
-    type Output = Self;
-
-    fn add(self, rhs: Self) -> Self::Output {
-        Self(self.0 + rhs.0)
-    }
-}
-
-// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
-// it for many shards in the same tenant.
-#[derive(Debug, Default)]
-pub(crate) struct ScheduleContext {
-    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
-    pub(crate) nodes: HashMap<NodeId, AffinityScore>,
-
-    /// Specifically how many _attached_ locations are on each node
-    pub(crate) attached_nodes: HashMap<NodeId, usize>,
-}
-
-impl ScheduleContext {
-    /// Input is a list of nodes we would like to avoid using again within this context.  The more
-    /// times a node is passed into this call, the less inclined we are to use it.
-    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
-        for node_id in nodes {
-            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
-            entry.inc()
-        }
-    }
-
-    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
-        let entry = self.attached_nodes.entry(node_id).or_default();
-        *entry += 1;
-    }
-
-    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
-        self.nodes
-            .get(&node_id)
-            .copied()
-            .unwrap_or(AffinityScore::FREE)
-    }
-
-    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
-        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
-    }
-}
-
 impl Scheduler {
    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
        let mut scheduler_nodes = HashMap::new();
@@ -288,47 +224,27 @@ impl Scheduler {
        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
    }

-    /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
-    /// are already in use by this shard -- we use this to avoid picking the same node
-    /// as both attached and secondary location.  This is a hard constraint: if we cannot
-    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
-    ///
-    /// context: we prefer to avoid using nodes identified in the context, according
-    /// to their anti-affinity score.  We use this to prefeer to avoid placing shards in
-    /// the same tenant on the same node.  This is a soft constraint: the context will never
-    /// cause us to fail to schedule a shard.
-    pub(crate) fn schedule_shard(
-        &self,
-        hard_exclude: &[NodeId],
-        context: &ScheduleContext,
-    ) -> Result<NodeId, ScheduleError> {
+    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut tenant_counts: Vec<(NodeId, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
                    None
                } else {
-                    Some((
-                        *k,
-                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                        v.shard_count,
-                    ))
+                    Some((*k, v.shard_count))
                }
            })
            .collect();

-        // Sort by, in order of precedence:
-        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization.  Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        // Sort by shard count.  Nodes with the same shard count are sorted by ID.
+        tenant_counts.sort_by_key(|i| (i.1, i.0));

-        if scores.is_empty() {
+        if tenant_counts.is_empty() {
            // After applying constraints, no pageservers were left.  We log some detail about
            // the state of nodes to help understand why this happened.  This is not logged as an error because
            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
@@ -344,11 +260,10 @@ impl Scheduler {
            return Err(ScheduleError::ImpossibleConstraint);
        }

-        // Lowest score wins
-        let node_id = scores.first().unwrap().0;
+        let node_id = tenant_counts.first().unwrap().0;
        tracing::info!(
-            "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
-            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
+            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );

        // Note that we do not update shard count here to reflect the scheduling: that
@@ -356,12 +271,6 @@ impl Scheduler {

        Ok(node_id)
    }
-
-    /// Unit test access to internal state
-    #[cfg(test)]
-    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
-        self.nodes.get(&node_id).unwrap().shard_count
-    }
 }

 #[cfg(test)]
@@ -407,17 +316,15 @@ mod tests {
        let mut t1_intent = IntentState::new();
        let mut t2_intent = IntentState::new();

-        let context = ScheduleContext::default();
-
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard(&[])?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard(&[])?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);

-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
        t1_intent.push_secondary(&mut scheduler, scheduled);

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -22,7 +22,6 @@ diesel::table! {
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
-        scheduling_policy -> Varchar,
    }
 }

--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -8,10 +8,7 @@ use std::{
 };

 use crate::{
-    id_lock_map::IdLockMap,
-    persistence::{AbortShardSplitStatus, TenantFilter},
-    reconciler::ReconcileError,
-    scheduler::ScheduleContext,
+    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
 };
 use anyhow::Context;
 use control_plane::storage_controller::{
@@ -20,14 +17,12 @@ use control_plane::storage_controller::{
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
-use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
-        TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
-        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
-        UtilizationScore,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest},
 };
@@ -56,6 +51,7 @@ use utils::{
    generation::Generation,
    http::error::ApiError,
    id::{NodeId, TenantId, TimelineId},
+    seqwait::SeqWait,
    sync::gate::Gate,
 };

@@ -70,6 +66,7 @@ use crate::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
        ReconcilerWaiter, TenantState,
    },
+    Sequence,
 };

 // For operations that should be quick, like attaching a new tenant
@@ -347,15 +344,9 @@ impl Service {
            }

            // Populate each tenant's intent state
-            let mut schedule_context = ScheduleContext::default();
            for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-                if tenant_shard_id.shard_number == ShardNumber(0) {
-                    // Reset scheduling context each time we advance to the next Tenant
-                    schedule_context = ScheduleContext::default();
-                }
-
                tenant_state.intent_from_observed(scheduler);
-                if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) {
+                if let Err(e) = tenant_state.schedule(scheduler) {
                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
                    // not enough pageservers are available.  The tenant may well still be available
                    // to clients.
@@ -679,13 +670,7 @@ impl Service {
        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
        while !self.cancel.is_cancelled() {
            tokio::select! {
-              _ = interval.tick() => {
-                let reconciles_spawned = self.reconcile_all();
-                if reconciles_spawned == 0 {
-                    // Run optimizer only when we didn't find any other work to do
-                    self.optimize_all();
-                }
-            }
+              _ = interval.tick() => { self.reconcile_all(); }
              _ = self.cancel.cancelled() => return
            }
        }
@@ -972,14 +957,30 @@ impl Service {
        }
        for tsp in tenant_shard_persistence {
            let tenant_shard_id = tsp.get_tenant_shard_id()?;
-
+            let shard_identity = tsp.get_shard_identity()?;
            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
            // it with what we can infer: the node for which a generation was most recently issued.
            let mut intent = IntentState::new();
            if let Some(generation_pageserver) = tsp.generation_pageserver {
                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
            }
-            let new_tenant = TenantState::from_persistent(tsp, intent)?;
+
+            let new_tenant = TenantState {
+                tenant_shard_id,
+                shard: shard_identity,
+                sequence: Sequence::initial(),
+                generation: tsp.generation.map(|g| Generation::new(g as u32)),
+                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+                intent,
+                observed: ObservedState::new(),
+                config: serde_json::from_str(&tsp.config).unwrap(),
+                reconciler: None,
+                splitting: tsp.splitting,
+                waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                last_error: Arc::default(),
+                pending_compute_notification: false,
+            };

            tenants.insert(tenant_shard_id, new_tenant);
        }
@@ -1103,8 +1104,6 @@ impl Service {
                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                splitting: SplitState::default(),
-                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
-                    .unwrap(),
            };

            match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1157,10 +1156,9 @@ impl Service {
                    // when we reattaching a detached tenant.
                    self.persistence
                        .update_tenant_shard(
-                            TenantFilter::Shard(attach_req.tenant_shard_id),
-                            Some(PlacementPolicy::Attached(0)),
-                            Some(conf),
-                            None,
+                            attach_req.tenant_shard_id,
+                            PlacementPolicy::Attached(0),
+                            conf,
                            None,
                        )
                        .await?;
@@ -1525,8 +1523,6 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<TenantCreateResponse, ApiError> {
-        let tenant_id = create_req.new_tenant_id.tenant_id;
-
        // Exclude any concurrent attempts to create/access the same tenant ID
        let _tenant_lock = self
            .tenant_op_locks
@@ -1535,12 +1531,7 @@ impl Service {

        let (response, waiters) = self.do_tenant_create(create_req).await?;

-        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
-            // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
-            // accept compute notifications while it is in the process of creating.  Reconciliation will
-            // be retried in the background.
-            tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
-        }
+        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
        Ok(response)
    }

@@ -1617,31 +1608,15 @@ impl Service {
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
                splitting: SplitState::default(),
-                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
-                    .unwrap(),
            })
            .collect();
-
-        match self
-            .persistence
+        self.persistence
            .insert_tenant_shards(persist_tenant_shards)
            .await
-        {
-            Ok(_) => {}
-            Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
-                DatabaseErrorKind::UniqueViolation,
-                _,
-            ))) => {
-                // Unique key violation: this is probably a retry.  Because the shard count is part of the unique key,
-                // if we see a unique key violation it means that the creation request's shard count matches the previous
-                // creation's shard count.
-                tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
-            }
-            // Any other database error is unexpected and a bug.
-            Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
-        };
-
-        let mut schedule_context = ScheduleContext::default();
+            .map_err(|e| {
+                // TODO: distinguish primary key violations (idempotent, OK) from other errors
+                ApiError::InternalServerError(anyhow::anyhow!(e))
+            })?;

        let (waiters, response_shards) = {
            let mut locked = self.inner.write().unwrap();
@@ -1664,14 +1639,11 @@ impl Service {
                        // attached and secondary locations (independently) away frorm those
                        // pageservers also holding a shard for this tenant.

-                        entry
-                            .get_mut()
-                            .schedule(scheduler, &mut schedule_context)
-                            .map_err(|e| {
-                                ApiError::Conflict(format!(
-                                    "Failed to schedule shard {tenant_shard_id}: {e}"
-                                ))
-                            })?;
+                        entry.get_mut().schedule(scheduler).map_err(|e| {
+                            ApiError::Conflict(format!(
+                                "Failed to schedule shard {tenant_shard_id}: {e}"
+                            ))
+                        })?;

                        if let Some(node_id) = entry.get().intent.get_attached() {
                            let generation = entry
@@ -1699,7 +1671,7 @@ impl Service {

                        state.generation = initial_generation;
                        state.config = create_req.config.clone();
-                        if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
+                        if let Err(e) = state.schedule(scheduler) {
                            schcedule_error = Some(e);
                        }

@@ -1907,7 +1879,6 @@ impl Service {
                // Persist updates
                // Ordering: write to the database before applying changes in-memory, so that
                // we will not appear time-travel backwards on a restart.
-                let mut schedule_context = ScheduleContext::default();
                for ShardUpdate {
                    tenant_shard_id,
                    placement_policy,
@@ -1917,11 +1888,10 @@ impl Service {
                {
                    self.persistence
                        .update_tenant_shard(
-                            TenantFilter::Shard(*tenant_shard_id),
-                            Some(placement_policy.clone()),
-                            Some(tenant_config.clone()),
+                            *tenant_shard_id,
+                            placement_policy.clone(),
+                            tenant_config.clone(),
                            *generation,
-                            None,
                        )
                        .await?;
                }
@@ -1955,7 +1925,7 @@ impl Service {
                            shard.generation = Some(generation);
                        }

-                        shard.schedule(scheduler, &mut schedule_context)?;
+                        shard.schedule(scheduler)?;

                        let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
                        if let Some(waiter) = maybe_waiter {
@@ -1999,13 +1969,7 @@ impl Service {
        let config = req.config;

        self.persistence
-            .update_tenant_shard(
-                TenantFilter::Tenant(req.tenant_id),
-                None,
-                Some(config.clone()),
-                None,
-                None,
-            )
+            .update_tenant_config(req.tenant_id, config.clone())
            .await?;

        let waiters = {
@@ -2115,7 +2079,7 @@ impl Service {
            let scheduler = &locked.scheduler;
            // Right now we only perform the operation on a single node without parallelization
            // TODO fan out the operation to multiple nodes for better performance
-            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
+            let node_id = scheduler.schedule_shard(&[])?;
            let node = locked
                .nodes
                .get(&node_id)
@@ -2358,58 +2322,6 @@ impl Service {
        Ok(StatusCode::NOT_FOUND)
    }

-    /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
-    /// for a tenant.  The TenantConfig is passed through to pageservers, whereas this function modifies
-    /// the tenant's policies (configuration) within the storage controller
-    pub(crate) async fn tenant_update_policy(
-        &self,
-        tenant_id: TenantId,
-        req: TenantPolicyRequest,
-    ) -> Result<(), ApiError> {
-        // We require an exclusive lock, because we are updating persistent and in-memory state
-        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
-
-        let TenantPolicyRequest {
-            placement,
-            scheduling,
-        } = req;
-
-        self.persistence
-            .update_tenant_shard(
-                TenantFilter::Tenant(tenant_id),
-                placement.clone(),
-                None,
-                None,
-                scheduling,
-            )
-            .await?;
-
-        let mut schedule_context = ScheduleContext::default();
-        let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, scheduler) = locked.parts_mut();
-        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-            if let Some(placement) = &placement {
-                shard.policy = placement.clone();
-
-                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
-                               "Updated placement policy to {placement:?}");
-            }
-
-            if let Some(scheduling) = &scheduling {
-                shard.set_scheduling_policy(*scheduling);
-
-                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
-                               "Updated scheduling policy to {scheduling:?}");
-            }
-
-            // In case scheduling is being switched back on, try it now.
-            shard.schedule(scheduler, &mut schedule_context).ok();
-            self.maybe_reconcile_shard(shard, nodes);
-        }
-
-        Ok(())
-    }
-
    pub(crate) async fn tenant_timeline_create(
        &self,
        tenant_id: TenantId,
@@ -2736,71 +2648,45 @@ impl Service {
        })
    }

-    /// Returns None if the input iterator of shards does not include a shard with number=0
-    fn tenant_describe_impl<'a>(
-        &self,
-        shards: impl Iterator<Item = &'a TenantState>,
-    ) -> Option<TenantDescribeResponse> {
-        let mut shard_zero = None;
-        let mut describe_shards = Vec::new();
-
-        for shard in shards {
-            if shard.tenant_shard_id.is_zero() {
-                shard_zero = Some(shard);
-            }
-
-            describe_shards.push(TenantDescribeResponseShard {
-                tenant_shard_id: shard.tenant_shard_id,
-                node_attached: *shard.intent.get_attached(),
-                node_secondary: shard.intent.get_secondary().to_vec(),
-                last_error: shard.last_error.lock().unwrap().clone(),
-                is_reconciling: shard.reconciler.is_some(),
-                is_pending_compute_notification: shard.pending_compute_notification,
-                is_splitting: matches!(shard.splitting, SplitState::Splitting),
-                scheduling_policy: *shard.get_scheduling_policy(),
-            })
-        }
-
-        let shard_zero = shard_zero?;
-
-        Some(TenantDescribeResponse {
-            tenant_id: shard_zero.tenant_shard_id.tenant_id,
-            shards: describe_shards,
-            stripe_size: shard_zero.shard.stripe_size,
-            policy: shard_zero.policy.clone(),
-            config: shard_zero.config.clone(),
-        })
-    }
-
    pub(crate) fn tenant_describe(
        &self,
        tenant_id: TenantId,
    ) -> Result<TenantDescribeResponse, ApiError> {
        let locked = self.inner.read().unwrap();

-        self.tenant_describe_impl(
-            locked
-                .tenants
-                .range(TenantShardId::tenant_range(tenant_id))
-                .map(|(_k, v)| v),
-        )
-        .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
-    }
+        let mut shard_zero = None;
+        let mut shards = Vec::new();

-    pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
-        let locked = self.inner.read().unwrap();
-
-        let mut result = Vec::new();
-        for (_tenant_id, tenant_shards) in
-            &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
+        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
        {
-            result.push(
-                self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
-                    .expect("Groups are always non-empty"),
-            );
+            if tenant_shard_id.is_zero() {
+                shard_zero = Some(shard);
+            }
+
+            let response_shard = TenantDescribeResponseShard {
+                tenant_shard_id: *tenant_shard_id,
+                node_attached: *shard.intent.get_attached(),
+                node_secondary: shard.intent.get_secondary().to_vec(),
+                last_error: shard.last_error.lock().unwrap().clone(),
+                is_reconciling: shard.reconciler.is_some(),
+                is_pending_compute_notification: shard.pending_compute_notification,
+                is_splitting: matches!(shard.splitting, SplitState::Splitting),
+            };
+            shards.push(response_shard);
        }

-        result
+        let Some(shard_zero) = shard_zero else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        };
+
+        Ok(TenantDescribeResponse {
+            shards,
+            stripe_size: shard_zero.shard.stripe_size,
+            policy: shard_zero.policy.clone(),
+            config: shard_zero.config.clone(),
+        })
    }

    #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
@@ -2893,7 +2779,7 @@ impl Service {

                tracing::info!("Restoring parent shard {tenant_shard_id}");
                shard.splitting = SplitState::Idle;
-                if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
+                if let Err(e) = shard.schedule(scheduler) {
                    // If this shard can't be scheduled now (perhaps due to offline nodes or
                    // capacity issues), that must not prevent us rolling back a split.  In this
                    // case it should be eventually scheduled in the background.
@@ -3017,7 +2903,6 @@ impl Service {
                    )
                };

-                let mut schedule_context = ScheduleContext::default();
                for child in child_ids {
                    let mut child_shard = parent_ident;
                    child_shard.number = child.shard_number;
@@ -3053,7 +2938,7 @@ impl Service {

                    child_locations.push((child, pageserver, child_shard.stripe_size));

-                    if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) {
+                    if let Err(e) = child_state.schedule(scheduler) {
                        // This is not fatal, because we've implicitly already got an attached
                        // location for the child shard.  Failure here just means we couldn't
                        // find a secondary (e.g. because cluster is overloaded).
@@ -3346,10 +3231,6 @@ impl Service {
                    placement_policy: serde_json::to_string(&policy).unwrap(),
                    config: serde_json::to_string(&config).unwrap(),
                    splitting: SplitState::Splitting,
-
-                    // Scheduling policies do not carry through to children
-                    scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
-                        .unwrap(),
                });
            }

@@ -3917,7 +3798,6 @@ impl Service {
            AvailabilityTransition::ToOffline => {
                tracing::info!("Node {} transition to offline", node_id);
                let mut tenants_affected: usize = 0;
-
                for (tenant_shard_id, tenant_state) in tenants {
                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
                        // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
@@ -3934,13 +3814,7 @@ impl Service {

                    if tenant_state.intent.demote_attached(node_id) {
                        tenant_state.sequence = tenant_state.sequence.next();
-
-                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
-                        // for tenants without secondary locations: if they have a secondary location, then this
-                        // schedule() call is just promoting an existing secondary)
-                        let mut schedule_context = ScheduleContext::default();
-
-                        match tenant_state.schedule(scheduler, &mut schedule_context) {
+                        match tenant_state.schedule(scheduler) {
                            Err(e) => {
                                // It is possible that some tenants will become unschedulable when too many pageservers
                                // go offline: in this case there isn't much we can do other than make the issue observable.
@@ -3991,6 +3865,9 @@ impl Service {
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
+    ///
+    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
+    /// an attached policy.  We should error out if it isn't.
    fn ensure_attached_schedule(
        &self,
        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3999,27 +3876,10 @@ impl Service {
        let mut waiters = Vec::new();
        let (nodes, tenants, scheduler) = locked.parts_mut();

-        let mut schedule_context = ScheduleContext::default();
-        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-            shard.schedule(scheduler, &mut schedule_context)?;
-
-            // The shard's policies may not result in an attached location being scheduled: this
-            // is an error because our caller needs it attached somewhere.
-            if shard.intent.get_attached().is_none() {
-                return Err(anyhow::anyhow!(
-                    "Tenant {tenant_id} not scheduled to be attached"
-                ));
-            };
-
-            if shard.stably_attached().is_some() {
-                // We do not require the shard to be totally up to date on reconciliation: we just require
-                // that it has been attached on the intended node.   Other dirty state such as unattached secondary
-                // locations, or compute hook notifications can be ignored.
-                continue;
-            }
+        for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            shard.schedule(scheduler)?;

            if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
-                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
                waiters.push(waiter);
            }
        }
@@ -4081,144 +3941,8 @@ impl Service {
        let (nodes, tenants, _scheduler) = locked.parts_mut();
        let pageservers = nodes.clone();

-        let mut schedule_context = ScheduleContext::default();
-
        let mut reconciles_spawned = 0;
-        for (tenant_shard_id, shard) in tenants.iter_mut() {
-            if tenant_shard_id.is_zero() {
-                schedule_context = ScheduleContext::default();
-            }
-
-            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
-            // dirty, spawn another rone
-            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
-                reconciles_spawned += 1;
-            }
-
-            schedule_context.avoid(&shard.intent.all_pageservers());
-        }
-
-        reconciles_spawned
-    }
-
-    /// `optimize` in this context means identifying shards which have valid scheduled locations, but
-    /// could be scheduled somewhere better:
-    /// - Cutting over to a secondary if the node with the secondary is more lightly loaded
-    ///    * e.g. after a node fails then recovers, to move some work back to it
-    /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
-    ///    * e.g. after a shard split, the initial attached locations will all be on the node where
-    ///      we did the split, but are probably better placed elsewhere.
-    /// - Creating new secondary locations if it improves the spreading of a sharded tenant
-    ///    * e.g. after a shard split, some locations will be on the same node (where the split
-    ///     happened), and will probably be better placed elsewhere.
-    ///
-    /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
-    /// the time of scheduling, this function looks for cases where a better-scoring location is available
-    /// according to those same soft constraints.
-    fn optimize_all(&self) -> usize {
-        let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, scheduler) = locked.parts_mut();
-        let pageservers = nodes.clone();
-
-        let mut schedule_context = ScheduleContext::default();
-
-        let mut reconciles_spawned = 0;
-
-        let mut tenant_shards: Vec<&TenantState> = Vec::new();
-
-        // Limit on how many shards' optmizations each call to this function will execute.  Combined
-        // with the frequency of background calls, this acts as an implicit rate limit that runs a small
-        // trickle of optimizations in the background, rather than executing a large number in parallel
-        // when a change occurs.
-        const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
-
-        let mut work = Vec::new();
-
-        for (tenant_shard_id, shard) in tenants.iter() {
-            if tenant_shard_id.is_zero() {
-                // Reset accumulators on the first shard in a tenant
-                schedule_context = ScheduleContext::default();
-                tenant_shards.clear();
-            }
-
-            if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
-                break;
-            }
-
-            match shard.get_scheduling_policy() {
-                ShardSchedulingPolicy::Active => {
-                    // Ok to do optimization
-                }
-                ShardSchedulingPolicy::Essential
-                | ShardSchedulingPolicy::Pause
-                | ShardSchedulingPolicy::Stop => {
-                    // Policy prevents optimizing this shard.
-                    continue;
-                }
-            }
-
-            // Accumulate the schedule context for all the shards in a tenant: we must have
-            // the total view of all shards before we can try to optimize any of them.
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
-            tenant_shards.push(shard);
-
-            // Once we have seen the last shard in the tenant, proceed to search across all shards
-            // in the tenant for optimizations
-            if shard.shard.number.0 == shard.shard.count.count() - 1 {
-                if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
-                    // Do not start any optimizations while another change to the tenant is ongoing: this
-                    // is not necessary for correctness, but simplifies operations and implicitly throttles
-                    // optimization changes to happen in a "trickle" over time.
-                    continue;
-                }
-
-                if tenant_shards.iter().any(|s| {
-                    !matches!(s.splitting, SplitState::Idle)
-                        || matches!(s.policy, PlacementPolicy::Detached)
-                }) {
-                    // Never attempt to optimize a tenant that is currently being split, or
-                    // a tenant that is meant to be detached
-                    continue;
-                }
-
-                // TODO: optimization calculations are relatively expensive: create some fast-path for
-                // the common idle case (avoiding the search on tenants that we have recently checked)
-
-                for shard in &tenant_shards {
-                    if let Some(optimization) =
-                        // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
-                        // its primary location based on soft constraints, cut it over.
-                        shard.optimize_attachment(nodes, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    } else if let Some(optimization) =
-                        // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
-                        // better placed on another node, based on ScheduleContext, then adjust it.  This
-                        // covers cases like after a shard split, where we might have too many shards
-                        // in the same tenant with secondary locations on the node where they originally split.
-                        shard.optimize_secondary(scheduler, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    }
-
-                    // TODO: extend this mechanism to prefer attaching on nodes with fewer attached
-                    // tenants (i.e. extend schedule state to distinguish attached from secondary counts),
-                    // for the total number of attachments on a node (not just within a tenant.)
-                }
-            }
-        }
-
-        for (tenant_shard_id, optimization) in work {
-            let shard = tenants
-                .get_mut(&tenant_shard_id)
-                .expect("We held lock from place we got this ID");
-            shard.apply_optimization(scheduler, optimization);
-
+        for (_tenant_shard_id, shard) in tenants.iter_mut() {
            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                reconciles_spawned += 1;
            }
@@ -4227,32 +3951,6 @@ impl Service {
        reconciles_spawned
    }

-    /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
-    /// also wait for any generated Reconcilers to complete.  Calling this until it returns zero should
-    /// put the system into a quiescent state where future background reconciliations won't do anything.
-    pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
-        let reconciles_spawned = self.reconcile_all();
-        if reconciles_spawned == 0 {
-            // Only optimize when we are otherwise idle
-            self.optimize_all();
-        }
-
-        let waiters = {
-            let mut waiters = Vec::new();
-            let locked = self.inner.read().unwrap();
-            for (_tenant_shard_id, shard) in locked.tenants.iter() {
-                if let Some(waiter) = shard.get_waiter() {
-                    waiters.push(waiter);
-                }
-            }
-            waiters
-        };
-
-        let waiter_count = waiters.len();
-        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
-        Ok(waiter_count)
-    }
-
    pub async fn shutdown(&self) {
        // Note that this already stops processing any results from reconciles: so
        // we do not expect that our [`TenantState`] objects will reach a neat
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -7,9 +7,8 @@ use std::{
 use crate::{
    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
    persistence::TenantShardPersistence,
-    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
 };
-use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -117,10 +116,6 @@ pub(crate) struct TenantState {
    /// sending it.  This is the mechanism by which compute notifications are included in the scope
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,
-
-    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
-    // be set to a non-active state to avoid making changes while the issue is fixed.
-    scheduling_policy: ShardSchedulingPolicy,
 }

 #[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +246,8 @@ impl IntentState {

 impl Drop for IntentState {
    fn drop(&mut self) {
-        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
-        // We do not check this while panicking, to avoid polluting unit test failures or
-        // other assertions with this assertion's output.  It's still wrong to leak these,
-        // but if we already have a panic then we don't need to independently flag this case.
-        if !(std::thread::panicking()) {
-            debug_assert!(self.attached.is_none() && self.secondary.is_empty());
-        }
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
+        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
    }
 }

@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
    Failed(TenantShardId, String),
 }

-#[derive(Eq, PartialEq, Debug)]
-pub(crate) struct ReplaceSecondary {
-    old_node_id: NodeId,
-    new_node_id: NodeId,
-}
-
-#[derive(Eq, PartialEq, Debug)]
-pub(crate) struct MigrateAttachment {
-    old_attached_node_id: NodeId,
-    new_attached_node_id: NodeId,
-}
-
-#[derive(Eq, PartialEq, Debug)]
-pub(crate) enum ScheduleOptimization {
-    // Replace one of our secondary locations with a different node
-    ReplaceSecondary(ReplaceSecondary),
-    // Migrate attachment to an existing secondary location
-    MigrateAttachment(MigrateAttachment),
-}
-
 impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
@@ -400,7 +370,6 @@ impl TenantState {
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
            pending_compute_notification: false,
-            scheduling_policy: ShardSchedulingPolicy::default(),
        }
    }

@@ -456,7 +425,6 @@ impl TenantState {
    fn schedule_attached(
        &mut self,
        scheduler: &mut Scheduler,
-        context: &ScheduleContext,
    ) -> Result<(bool, NodeId), ScheduleError> {
        // No work to do if we already have an attached tenant
        if let Some(node_id) = self.intent.attached {
@@ -470,33 +438,14 @@ impl TenantState {
            Ok((true, promote_secondary))
        } else {
            // Pick a fresh node: either we had no secondaries or none were schedulable
-            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
+            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
            tracing::debug!("Selected {} as attached", node_id);
            self.intent.set_attached(scheduler, Some(node_id));
            Ok((true, node_id))
        }
    }

-    pub(crate) fn schedule(
-        &mut self,
-        scheduler: &mut Scheduler,
-        context: &mut ScheduleContext,
-    ) -> Result<(), ScheduleError> {
-        let r = self.do_schedule(scheduler, context);
-
-        context.avoid(&self.intent.all_pageservers());
-        if let Some(attached) = self.intent.get_attached() {
-            context.push_attached(*attached);
-        }
-
-        r
-    }
-
-    pub(crate) fn do_schedule(
-        &mut self,
-        scheduler: &mut Scheduler,
-        context: &ScheduleContext,
-    ) -> Result<(), ScheduleError> {
+    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.
@@ -504,16 +453,6 @@ impl TenantState {
        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
        // change their attach location.

-        match self.scheduling_policy {
-            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
-            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
-                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
-                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                    "Scheduling is disabled by policy {:?}", self.scheduling_policy);
-                return Ok(());
-            }
-        }
-
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut modified = false;
@@ -540,13 +479,12 @@ impl TenantState {
                }

                // Should have exactly one attached, and N secondaries
-                let (modified_attached, attached_node_id) =
-                    self.schedule_attached(scheduler, context)?;
+                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;

                let mut used_pageservers = vec![attached_node_id];
                while self.intent.secondary.len() < secondary_count {
-                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
+                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.push_secondary(scheduler, node_id);
                    used_pageservers.push(node_id);
                    modified = true;
@@ -559,7 +497,7 @@ impl TenantState {
                    modified = true;
                } else if self.intent.secondary.is_empty() {
                    // Populate secondary by scheduling a fresh node
-                    let node_id = scheduler.schedule_shard(&[], context)?;
+                    let node_id = scheduler.schedule_shard(&[])?;
                    self.intent.push_secondary(scheduler, node_id);
                    modified = true;
                }
@@ -586,167 +524,6 @@ impl TenantState {
        Ok(())
    }

-    /// Optimize attachments: if a shard has a secondary location that is preferable to
-    /// its primary location based on soft constraints, switch that secondary location
-    /// to be attached.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn optimize_attachment(
-        &self,
-        nodes: &HashMap<NodeId, Node>,
-        schedule_context: &ScheduleContext,
-    ) -> Option<ScheduleOptimization> {
-        let attached = (*self.intent.get_attached())?;
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
-            return None;
-        }
-
-        let current_affinity_score = schedule_context.get_node_affinity(attached);
-        let current_attachment_count = schedule_context.get_node_attachments(attached);
-
-        // Generate score for each node, dropping any un-schedulable nodes.
-        let all_pageservers = self.intent.all_pageservers();
-        let mut scores = all_pageservers
-            .iter()
-            .flat_map(|node_id| {
-                if matches!(
-                    nodes
-                        .get(node_id)
-                        .map(|n| n.may_schedule())
-                        .unwrap_or(MaySchedule::No),
-                    MaySchedule::No
-                ) {
-                    None
-                } else {
-                    let affinity_score = schedule_context.get_node_affinity(*node_id);
-                    let attachment_count = schedule_context.get_node_attachments(*node_id);
-                    Some((*node_id, affinity_score, attachment_count))
-                }
-            })
-            .collect::<Vec<_>>();
-
-        // Sort precedence:
-        //  1st - prefer nodes with the lowest total affinity score
-        //  2nd - prefer nodes with the lowest number of attachments in this context
-        //  3rd - if all else is equal, sort by node ID for determinism in tests.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
-
-        if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
-            scores.first()
-        {
-            if attached != *preferred_node {
-                // The best alternative must be more than 1 better than us, otherwise we could end
-                // up flapping back next time we're called (e.g. there's no point migrating from
-                // a location with score 1 to a score zero, because on next location the situation
-                // would be the same, but in reverse).
-                if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
-                    || current_attachment_count > *preferred_attachment_count + 1
-                {
-                    tracing::info!(
-                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
-                        self.intent.get_secondary()
-                    );
-                    return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                        old_attached_node_id: attached,
-                        new_attached_node_id: *preferred_node,
-                    }));
-                }
-            } else {
-                tracing::debug!(
-                    "Node {} is already preferred (score {:?})",
-                    preferred_node,
-                    preferred_affinity_score
-                );
-            }
-        }
-
-        // Fall-through: we didn't find an optimization
-        None
-    }
-
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn optimize_secondary(
-        &self,
-        scheduler: &Scheduler,
-        schedule_context: &ScheduleContext,
-    ) -> Option<ScheduleOptimization> {
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
-            return None;
-        }
-
-        for secondary in self.intent.get_secondary() {
-            let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
-                // We're already on a node unaffected any affinity constraints,
-                // so we won't change it.
-                continue;
-            };
-
-            // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
-            // This implicitly limits the choice to nodes that are available, and prefers nodes
-            // with lower utilization.
-            let Ok(candidate_node) =
-                scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
-            else {
-                // A scheduling error means we have no possible candidate replacements
-                continue;
-            };
-
-            let candidate_affinity_score = schedule_context
-                .nodes
-                .get(&candidate_node)
-                .unwrap_or(&AffinityScore::FREE);
-
-            // The best alternative must be more than 1 better than us, otherwise we could end
-            // up flapping back next time we're called.
-            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
-                // If some other node is available and has a lower score than this node, then
-                // that other node is a good place to migrate to.
-                tracing::info!(
-                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
-                    self.intent.get_secondary()
-                );
-                return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                    old_node_id: *secondary,
-                    new_node_id: candidate_node,
-                }));
-            }
-        }
-
-        None
-    }
-
-    pub(crate) fn apply_optimization(
-        &mut self,
-        scheduler: &mut Scheduler,
-        optimization: ScheduleOptimization,
-    ) {
-        metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_schedule_optimization
-            .inc();
-
-        match optimization {
-            ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id,
-                new_attached_node_id,
-            }) => {
-                self.intent.demote_attached(old_attached_node_id);
-                self.intent
-                    .promote_attached(scheduler, new_attached_node_id);
-            }
-            ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                old_node_id,
-                new_node_id,
-            }) => {
-                self.intent.remove_secondary(scheduler, old_node_id);
-                self.intent.push_secondary(scheduler, new_node_id);
-            }
-        }
-    }
-
    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
    /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -891,19 +668,6 @@ impl TenantState {
            }
        }

-        // Pre-checks done: finally check whether we may actually do the work
-        match self.scheduling_policy {
-            ShardSchedulingPolicy::Active
-            | ShardSchedulingPolicy::Essential
-            | ShardSchedulingPolicy::Pause => {}
-            ShardSchedulingPolicy::Stop => {
-                // We only reach this point if there is work to do and we're going to skip
-                // doing it: warn it obvious why this tenant isn't doing what it ought to.
-                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
-                return None;
-            }
-        }
-
        // Build list of nodes from which the reconciler should detach
        let mut detach = Vec::new();
        for node_id in self.observed.locations.keys() {
@@ -1040,22 +804,6 @@ impl TenantState {
        })
    }

-    /// Get a waiter for any reconciliation in flight, but do not start reconciliation
-    /// if it is not already running
-    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
-        if self.reconciler.is_some() {
-            Some(ReconcilerWaiter {
-                tenant_shard_id: self.tenant_shard_id,
-                seq_wait: self.waiter.clone(),
-                error_seq_wait: self.error_waiter.clone(),
-                error: self.last_error.clone(),
-                seq: self.sequence,
-            })
-        } else {
-            None
-        }
-    }
-
    /// Called when a ReconcileResult has been emitted and the service is updating
    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
    /// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +829,6 @@ impl TenantState {
        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
    }

-    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
-        self.scheduling_policy = p;
-    }
-
-    pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
-        &self.scheduling_policy
-    }
-
-    pub(crate) fn from_persistent(
-        tsp: TenantShardPersistence,
-        intent: IntentState,
-    ) -> anyhow::Result<Self> {
-        let tenant_shard_id = tsp.get_tenant_shard_id()?;
-        let shard_identity = tsp.get_shard_identity()?;
-
-        Ok(Self {
-            tenant_shard_id,
-            shard: shard_identity,
-            sequence: Sequence::initial(),
-            generation: tsp.generation.map(|g| Generation::new(g as u32)),
-            policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-            intent,
-            observed: ObservedState::new(),
-            config: serde_json::from_str(&tsp.config).unwrap(),
-            reconciler: None,
-            splitting: tsp.splitting,
-            waiter: Arc::new(SeqWait::new(Sequence::initial())),
-            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-            last_error: Arc::default(),
-            pending_compute_notification: false,
-            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
-        })
-    }
-
    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
        TenantShardPersistence {
            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +840,6 @@ impl TenantState {
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
-            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
        }
    }
 }
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
        )
    }

-    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
-        let tenant_id = TenantId::generate();
-
-        (0..shard_count.count())
-            .map(|i| {
-                let shard_number = ShardNumber(i);
-
-                let tenant_shard_id = TenantShardId {
-                    tenant_id,
-                    shard_number,
-                    shard_count,
-                };
-                TenantState::new(
-                    tenant_shard_id,
-                    ShardIdentity::new(
-                        shard_number,
-                        shard_count,
-                        pageserver_api::shard::ShardStripeSize(32768),
-                    )
-                    .unwrap(),
-                    policy.clone(),
-                )
-            })
-            .collect()
-    }
-
    /// Test the scheduling behaviors used when a tenant configured for HA is subject
    /// to nodes being marked offline.
    #[test]
@@ -1200,11 +887,10 @@ pub(crate) mod tests {
        let mut nodes = make_test_nodes(3);

        let mut scheduler = Scheduler::new(nodes.values());
-        let mut context = ScheduleContext::default();

        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_state
-            .schedule(&mut scheduler, &mut context)
+            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");

        // Expect to initially be schedule on to different nodes
@@ -1230,7 +916,7 @@ pub(crate) mod tests {

        // Scheduling the node should promote the still-available secondary node to attached
        tenant_state
-            .schedule(&mut scheduler, &mut context)
+            .schedule(&mut scheduler)
            .expect("active nodes are available");
        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);

@@ -1294,219 +980,4 @@ pub(crate) mod tests {
        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }
-
-    #[test]
-    fn scheduling_mode() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(3);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // In pause mode, schedule() shouldn't do anything
-        tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
-        assert!(tenant_state
-            .schedule(&mut scheduler, &mut ScheduleContext::default())
-            .is_ok());
-        assert!(tenant_state.intent.all_pageservers().is_empty());
-
-        // In active mode, schedule() works
-        tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
-        assert!(tenant_state
-            .schedule(&mut scheduler, &mut ScheduleContext::default())
-            .is_ok());
-        assert!(!tenant_state.intent.all_pageservers().is_empty());
-
-        tenant_state.intent.clear(&mut scheduler);
-        Ok(())
-    }
-
-    #[test]
-    fn optimize_attachment() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(3);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // Initially: both nodes attached on shard 1, and both have secondary locations
-        // on different nodes.
-        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
-        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
-
-        let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
-
-        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
-
-        // Either shard should recognize that it has the option to switch to a secondary location where there
-        // would be no other shards from the same tenant, and request to do so.
-        assert_eq!(
-            optimization_a,
-            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id: NodeId(1),
-                new_attached_node_id: NodeId(2)
-            }))
-        );
-
-        // Note that these optimizing two shards in the same tenant with the same ScheduleContext is
-        // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
-        // of [`Service::optimize_all`] to avoid trying
-        // to do optimizations for multiple shards in the same tenant at the same time.  Generating
-        // both optimizations is just done for test purposes
-        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
-        assert_eq!(
-            optimization_b,
-            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id: NodeId(1),
-                new_attached_node_id: NodeId(3)
-            }))
-        );
-
-        // Applying these optimizations should result in the end state proposed
-        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
-        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
-        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
-        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
-        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
-        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
-
-        shard_a.intent.clear(&mut scheduler);
-        shard_b.intent.clear(&mut scheduler);
-
-        Ok(())
-    }
-
-    #[test]
-    fn optimize_secondary() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(4);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // Initially: both nodes attached on shard 1, and both have secondary locations
-        // on different nodes.
-        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
-        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
-        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
-
-        let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
-
-        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
-
-        // Since there is a node with no locations available, the node with two locations for the
-        // same tenant should generate an optimization to move one away
-        assert_eq!(
-            optimization_a,
-            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                old_node_id: NodeId(3),
-                new_node_id: NodeId(4)
-            }))
-        );
-
-        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
-        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
-        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
-
-        shard_a.intent.clear(&mut scheduler);
-        shard_b.intent.clear(&mut scheduler);
-
-        Ok(())
-    }
-
-    // Optimize til quiescent: this emulates what Service::optimize_all does, when
-    // called repeatedly in the background.
-    fn optimize_til_idle(
-        nodes: &HashMap<NodeId, Node>,
-        scheduler: &mut Scheduler,
-        shards: &mut [TenantState],
-    ) {
-        let mut loop_n = 0;
-        loop {
-            let mut schedule_context = ScheduleContext::default();
-            let mut any_changed = false;
-
-            for shard in shards.iter() {
-                schedule_context.avoid(&shard.intent.all_pageservers());
-                if let Some(attached) = shard.intent.get_attached() {
-                    schedule_context.push_attached(*attached);
-                }
-            }
-
-            for shard in shards.iter_mut() {
-                let optimization = shard.optimize_attachment(nodes, &schedule_context);
-                if let Some(optimization) = optimization {
-                    shard.apply_optimization(scheduler, optimization);
-                    any_changed = true;
-                    break;
-                }
-
-                let optimization = shard.optimize_secondary(scheduler, &schedule_context);
-                if let Some(optimization) = optimization {
-                    shard.apply_optimization(scheduler, optimization);
-                    any_changed = true;
-                    break;
-                }
-            }
-
-            if !any_changed {
-                break;
-            }
-
-            // Assert no infinite loop
-            loop_n += 1;
-            assert!(loop_n < 1000);
-        }
-    }
-
-    /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
-    /// that it converges.
-    #[test]
-    fn optimize_add_nodes() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(4);
-
-        // Only show the scheduler a couple of nodes
-        let mut scheduler = Scheduler::new([].iter());
-        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
-
-        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
-        let mut schedule_context = ScheduleContext::default();
-        for shard in &mut shards {
-            assert!(shard
-                .schedule(&mut scheduler, &mut schedule_context)
-                .is_ok());
-        }
-
-        // We should see equal number of locations on the two nodes.
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-
-        // Add another two nodes: we should see the shards spread out when their optimize
-        // methods are called
-        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
-        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
-
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
-
-        for shard in shards.iter_mut() {
-            shard.intent.clear(&mut scheduler);
-        }
-
-        Ok(())
-    }
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
+};
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -1058,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }

+        Some(("set-state", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            let scheduling = subcommand_args.get_one("scheduling");
+            let availability = subcommand_args.get_one("availability");
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .node_configure(NodeConfigureRequest {
+                    node_id: pageserver.conf.id,
+                    scheduling: scheduling.cloned(),
+                    availability: availability.cloned(),
+                })
+                .await?;
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1498,6 +1515,12 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
+                .subcommand(Command::new("set-state")
+                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
+                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
+                    .about("Set scheduling or availability state of pageserver node")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("storage_controller")
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -389,10 +389,6 @@ impl PageServerNode {
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
-            image_layer_creation_check_threshold: settings
-                .remove("image_layer_creation_check_threshold")
-                .map(|x| x.parse::<u8>())
-                .transpose()?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -505,12 +501,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-                image_layer_creation_check_threshold: settings
-                    .remove("image_layer_creation_check_threshold")
-                    .map(|x| x.parse::<u8>())
-                    .transpose()
-                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
-
                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
                    .remove("walreceiver_connect_timeout")
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -1,23 +0,0 @@
-[package]
-name = "storcon_cli"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-
-[dependencies]
-anyhow.workspace = true
-clap.workspace = true
-comfy-table.workspace = true
-hyper.workspace = true
-pageserver_api.workspace = true
-pageserver_client.workspace = true
-reqwest.workspace = true
-serde.workspace = true
-serde_json = { workspace = true, features = ["raw_value"] }
-thiserror.workspace = true
-tokio.workspace = true
-tracing.workspace = true
-utils.workspace = true
-workspace_hack.workspace = true
-
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,587 +0,0 @@
-use std::{collections::HashMap, str::FromStr};
-
-use clap::{Parser, Subcommand};
-use hyper::Method;
-use pageserver_api::{
-    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
-        TenantDescribeResponse, TenantPolicyRequest,
-    },
-    models::{
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
-    },
-    shard::{ShardStripeSize, TenantShardId},
-};
-use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
-use reqwest::Url;
-use serde::{de::DeserializeOwned, Serialize};
-use utils::id::{NodeId, TenantId};
-
-use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
-};
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
-    /// since pageservers auto-register when they start up
-    NodeRegister {
-        #[arg(long)]
-        node_id: NodeId,
-
-        #[arg(long)]
-        listen_pg_addr: String,
-        #[arg(long)]
-        listen_pg_port: u16,
-
-        #[arg(long)]
-        listen_http_addr: String,
-        #[arg(long)]
-        listen_http_port: u16,
-    },
-
-    /// Modify a node's configuration in the storage controller
-    NodeConfigure {
-        #[arg(long)]
-        node_id: NodeId,
-
-        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
-        /// manually mark a node offline
-        #[arg(long)]
-        availability: Option<NodeAvailabilityArg>,
-        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
-        #[arg(long)]
-        scheduling: Option<NodeSchedulingPolicy>,
-    },
-    /// Modify a tenant's policies in the storage controller
-    TenantPolicy {
-        #[arg(long)]
-        tenant_id: TenantId,
-        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
-        /// or is in the normal attached state with N secondary locations (`attached:N`)
-        #[arg(long)]
-        placement: Option<PlacementPolicyArg>,
-        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
-        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
-        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
-        /// unavailable, and are only for use in emergencies.
-        #[arg(long)]
-        scheduling: Option<ShardSchedulingPolicyArg>,
-    },
-    /// List nodes known to the storage controller
-    Nodes {},
-    /// List tenants known to the storage controller
-    Tenants {},
-    /// Create a new tenant in the storage controller, and by extension on pageservers.
-    TenantCreate {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Delete a tenant in the storage controller, and by extension on pageservers.
-    TenantDelete {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Split an existing tenant into a higher number of shards than its current shard count.
-    TenantShardSplit {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        shard_count: u8,
-        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
-        #[arg(long)]
-        stripe_size: Option<u32>,
-    },
-    /// Migrate the attached location for a tenant shard to a specific pageserver.
-    TenantShardMigrate {
-        #[arg(long)]
-        tenant_shard_id: TenantShardId,
-        #[arg(long)]
-        node: NodeId,
-    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
-    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        config: String,
-    },
-    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
-    /// alternative to the storage controller's scheduling optimization behavior.
-    TenantScatter {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Print details about a particular tenant, including all its shards' states.
-    TenantDescribe {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-}
-
-#[derive(Parser)]
-#[command(
-    author,
-    version,
-    about,
-    long_about = "CLI for Storage Controller Support/Debug"
-)]
-#[command(arg_required_else_help(true))]
-struct Cli {
-    #[arg(long)]
-    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
-    api: Url,
-
-    #[arg(long)]
-    /// JWT token for authenticating with storage controller.  Depending on the API used, this
-    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
-    /// a token with both scopes to use with this tool.
-    jwt: Option<String>,
-
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Debug, Clone)]
-struct PlacementPolicyArg(PlacementPolicy);
-
-impl FromStr for PlacementPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "detached" => Ok(Self(PlacementPolicy::Detached)),
-            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
-            _ if s.starts_with("attached:") => {
-                let mut splitter = s.split(':');
-                let _prefix = splitter.next().unwrap();
-                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
-                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
-                    None => Err(anyhow::anyhow!(
-                        "Invalid format '{s}', a valid example is 'attached:1'"
-                    )),
-                }
-            }
-            _ => Err(anyhow::anyhow!(
-                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
-
-impl FromStr for ShardSchedulingPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
-            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
-            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
-            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
-            _ => Err(anyhow::anyhow!(
-                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct NodeAvailabilityArg(NodeAvailabilityWrapper);
-
-impl FromStr for NodeAvailabilityArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
-            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-struct Client {
-    base_url: Url,
-    jwt_token: Option<String>,
-    client: reqwest::Client,
-}
-
-impl Client {
-    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
-        Self {
-            base_url,
-            jwt_token,
-            client: reqwest::ClientBuilder::new()
-                .build()
-                .expect("Failed to construct http client"),
-        }
-    }
-
-    /// Simple HTTP request wrapper for calling into attachment service
-    async fn dispatch<RQ, RS>(
-        &self,
-        method: hyper::Method,
-        path: String,
-        body: Option<RQ>,
-    ) -> mgmt_api::Result<RS>
-    where
-        RQ: Serialize + Sized,
-        RS: DeserializeOwned + Sized,
-    {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            self.base_url.host_str().unwrap(),
-            self.base_url.port().unwrap()
-        ))
-        .unwrap();
-
-        let mut builder = self.client.request(method, url);
-        if let Some(body) = body {
-            builder = builder.json(&body)
-        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }
-
-        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
-        let response = response.error_from_body().await?;
-
-        response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
-    }
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let cli = Cli::parse();
-
-    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
-
-    let mut trimmed = cli.api.to_string();
-    trimmed.pop();
-    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
-
-    match cli.command {
-        Command::NodeRegister {
-            node_id,
-            listen_pg_addr,
-            listen_pg_port,
-            listen_http_addr,
-            listen_http_port,
-        } => {
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::POST,
-                    "control/v1/node".to_string(),
-                    Some(NodeRegisterRequest {
-                        node_id,
-                        listen_pg_addr,
-                        listen_pg_port,
-                        listen_http_addr,
-                        listen_http_port,
-                    }),
-                )
-                .await?;
-        }
-        Command::TenantCreate { tenant_id } => {
-            vps_client
-                .tenant_create(&TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: None,
-                    shard_parameters: ShardParameters::default(),
-                    placement_policy: Some(PlacementPolicy::Attached(1)),
-                    config: TenantConfig::default(),
-                })
-                .await?;
-        }
-        Command::TenantDelete { tenant_id } => {
-            let status = vps_client
-                .tenant_delete(TenantShardId::unsharded(tenant_id))
-                .await?;
-            tracing::info!("Delete status: {}", status);
-        }
-        Command::Nodes {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
-            for node in resp {
-                table.add_row([
-                    format!("{}", node.id),
-                    node.listen_http_addr,
-                    format!("{:?}", node.scheduling),
-                    format!("{:?}", node.availability),
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::NodeConfigure {
-            node_id,
-            availability,
-            scheduling,
-        } => {
-            let req = NodeConfigureRequest {
-                node_id,
-                availability: availability.map(|a| a.0),
-                scheduling,
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/config"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::Tenants {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "TenantId",
-                "ShardCount",
-                "StripeSize",
-                "Placement",
-                "Scheduling",
-            ]);
-            for tenant in resp {
-                let shard_zero = tenant.shards.into_iter().next().unwrap();
-                table.add_row([
-                    format!("{}", tenant.tenant_id),
-                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
-                    format!("{:?}", tenant.stripe_size),
-                    format!("{:?}", tenant.policy),
-                    format!("{:?}", shard_zero.scheduling_policy),
-                ]);
-            }
-
-            println!("{table}");
-        }
-        Command::TenantPolicy {
-            tenant_id,
-            placement,
-            scheduling,
-        } => {
-            let req = TenantPolicyRequest {
-                scheduling: scheduling.map(|s| s.0),
-                placement: placement.map(|p| p.0),
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/policy"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantShardSplit {
-            tenant_id,
-            shard_count,
-            stripe_size,
-        } => {
-            let req = TenantShardSplitRequest {
-                new_shard_count: shard_count,
-                new_stripe_size: stripe_size.map(ShardStripeSize),
-            };
-
-            let response = storcon_client
-                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/shard_split"),
-                    Some(req),
-                )
-                .await?;
-            println!(
-                "Split tenant {} into {} shards: {}",
-                tenant_id,
-                shard_count,
-                response
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
-        Command::TenantShardMigrate {
-            tenant_shard_id,
-            node,
-        } => {
-            let req = TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id: node,
-            };
-
-            storcon_client
-                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantConfig { tenant_id, config } => {
-            let tenant_conf = serde_json::from_str(&config)?;
-
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: tenant_conf,
-                })
-                .await?;
-        }
-        Command::TenantScatter { tenant_id } => {
-            // Find the shards
-            let locate_response = storcon_client
-                .dispatch::<(), TenantLocateResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}/locate"),
-                    None,
-                )
-                .await?;
-            let shards = locate_response.shards;
-
-            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
-            let shard_count = shards.len();
-            for s in shards {
-                let entry = node_to_shards.entry(s.node_id).or_default();
-                entry.push(s.shard_id);
-            }
-
-            // Load list of available nodes
-            let nodes_resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            for node in nodes_resp {
-                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
-                    node_to_shards.entry(node.id).or_default();
-                }
-            }
-
-            let max_shard_per_node = shard_count / node_to_shards.len();
-
-            loop {
-                let mut migrate_shard = None;
-                for shards in node_to_shards.values_mut() {
-                    if shards.len() > max_shard_per_node {
-                        // Pick the emptiest
-                        migrate_shard = Some(shards.pop().unwrap());
-                    }
-                }
-                let Some(migrate_shard) = migrate_shard else {
-                    break;
-                };
-
-                // Pick the emptiest node to migrate to
-                let mut destinations = node_to_shards
-                    .iter()
-                    .map(|(k, v)| (k, v.len()))
-                    .collect::<Vec<_>>();
-                destinations.sort_by_key(|i| i.1);
-                let (destination_node, destination_count) = *destinations.first().unwrap();
-                if destination_count + 1 > max_shard_per_node {
-                    // Even the emptiest destination doesn't have space: we're done
-                    break;
-                }
-                let destination_node = *destination_node;
-
-                node_to_shards
-                    .get_mut(&destination_node)
-                    .unwrap()
-                    .push(migrate_shard);
-
-                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
-
-                storcon_client
-                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                        Method::PUT,
-                        format!("control/v1/tenant/{migrate_shard}/migrate"),
-                        Some(TenantShardMigrateRequest {
-                            tenant_shard_id: migrate_shard,
-                            node_id: destination_node,
-                        }),
-                    )
-                    .await?;
-                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
-            }
-
-            // Spread the shards across the nodes
-        }
-        Command::TenantDescribe { tenant_id } => {
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
-            for shard in shards {
-                let secondary = shard
-                    .node_secondary
-                    .iter()
-                    .map(|n| format!("{}", n))
-                    .collect::<Vec<_>>()
-                    .join(",");
-
-                let mut status_parts = Vec::new();
-                if shard.is_reconciling {
-                    status_parts.push("reconciling");
-                }
-
-                if shard.is_pending_compute_notification {
-                    status_parts.push("pending_compute");
-                }
-
-                if shard.is_splitting {
-                    status_parts.push("splitting");
-                }
-                let status = status_parts.join(",");
-
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
-                        .unwrap_or(String::new()),
-                    secondary,
-                    shard.last_error,
-                    status,
-                ]);
-            }
-            println!("{table}");
-        }
-    }
-
-    Ok(())
-}
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -40,7 +40,7 @@ macro_rules! register_hll {
    }};

    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
-        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
    }};
 }

--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -4,7 +4,7 @@ use std::str::FromStr;
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`attachment_service::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;

 use crate::{
    models::{ShardParameters, TenantConfig},
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
    pub scheduling: Option<NodeSchedulingPolicy>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantPolicyRequest {
-    pub placement: Option<PlacementPolicy>,
-    pub scheduling: Option<ShardSchedulingPolicy>,
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
-    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
    pub stripe_size: ShardStripeSize,
    pub policy: PlacementPolicy,
    pub config: TenantConfig,
 }

-#[derive(Serialize, Deserialize)]
-pub struct NodeDescribeResponse {
-    pub id: NodeId,
-
-    pub availability: NodeAvailabilityWrapper,
-    pub scheduling: NodeSchedulingPolicy,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
-
-    pub scheduling_policy: ShardSchedulingPolicy,
 }

 /// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
 pub struct UtilizationScore(pub u64);

 impl UtilizationScore {
@@ -129,7 +106,7 @@ impl UtilizationScore {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
+#[derive(Serialize, Clone, Copy)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 pub enum NodeAvailabilityWrapper {
    Active,
    Offline,
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
-pub enum ShardSchedulingPolicy {
-    // Normal mode: the tenant's scheduled locations may be updated at will, including
-    // for non-essential optimization.
-    Active,
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;

-    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
-    // For example, this still permits a node's attachment location to change to a secondary in
-    // response to a node failure, or to assign a new secondary if a node was removed.
-    Essential,
-
-    // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
-    // unavailable, it will not be rescheduled to another node.
-    Pause,
-
-    // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
-    // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
-    Stop,
-}
-
-impl Default for ShardSchedulingPolicy {
-    fn default() -> Self {
-        Self::Active
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            // This is used when parsing node configuration requests from neon-local.
+            // Assume the worst possible utilisation score
+            // and let it get updated via the heartbeats.
+            "active" => Ok(Self::Active(UtilizationScore::worst())),
+            "offline" => Ok(Self::Offline),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -301,7 +301,6 @@ pub struct TenantConfig {
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
-    pub image_layer_creation_check_threshold: Option<u8>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -565,16 +565,6 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);

-impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
-    fn from(arr: [(&str, &str); N]) -> Self {
-        let map: HashMap<String, String> = arr
-            .iter()
-            .map(|(k, v)| (k.to_string(), v.to_string()))
-            .collect();
-        Self(map)
-    }
-}
-
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,18 +182,6 @@ where
        }
    }

-    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
-    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
-        let internal = self.internal.lock().unwrap();
-        let cnt = internal.current.cnt_value();
-        drop(internal);
-        if cnt >= num {
-            Ok(())
-        } else {
-            Err(cnt)
-        }
-    }
-
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -59,7 +59,6 @@ signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
-sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
    fanout: u64,
    ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
-    let exp_base = fanout.max(2);
+    assert!(fanout >= 2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
            break;
        }
        current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
+        current_level_target_height = current_level_target_height.saturating_mul(fanout);
    }
    Ok(())
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -600,37 +600,33 @@ fn start_pageserver(
            None,
            "consumption metrics collection",
            true,
-            {
-                let tenant_manager = tenant_manager.clone();
-                async move {
-                    // first wait until background jobs are cleared to launch.
-                    //
-                    // this is because we only process active tenants and timelines, and the
-                    // Timeline::get_current_logical_size will spawn the logical size calculation,
-                    // which will not be rate-limited.
-                    let cancel = task_mgr::shutdown_token();
+            async move {
+                // first wait until background jobs are cleared to launch.
+                //
+                // this is because we only process active tenants and timelines, and the
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // which will not be rate-limited.
+                let cancel = task_mgr::shutdown_token();

-                    tokio::select! {
-                        _ = cancel.cancelled() => { return Ok(()); },
-                        _ = background_jobs_barrier.wait() => {}
-                    };
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = background_jobs_barrier.wait() => {}
+                };

-                    pageserver::consumption_metrics::collect_metrics(
-                        tenant_manager,
-                        metric_collection_endpoint,
-                        &conf.metric_collection_bucket,
-                        conf.metric_collection_interval,
-                        conf.cached_metric_collection_interval,
-                        conf.synthetic_size_calculation_interval,
-                        conf.id,
-                        local_disk_storage,
-                        cancel,
-                        metrics_ctx,
-                    )
-                    .instrument(info_span!("metrics_collection"))
-                    .await?;
-                    Ok(())
-                }
+                pageserver::consumption_metrics::collect_metrics(
+                    metric_collection_endpoint,
+                    &conf.metric_collection_bucket,
+                    conf.metric_collection_interval,
+                    conf.cached_metric_collection_interval,
+                    conf.synthetic_size_calculation_interval,
+                    conf.id,
+                    local_disk_storage,
+                    cancel,
+                    metrics_ctx,
+                )
+                .instrument(info_span!("metrics_collection"))
+                .await?;
+                Ok(())
            },
        );
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -95,8 +95,6 @@ pub mod defaults {

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

-    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-
    ///
    /// Default built-in configuration file.
    ///
@@ -158,8 +156,6 @@ pub mod defaults {
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

-#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
-
 [remote_storage]

 "#
@@ -283,13 +279,6 @@ pub struct PageServerConf {
    pub max_vectored_read_bytes: MaxVectoredReadBytes,

    pub validate_vectored_get: bool,
-
-    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
-    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
-    /// of ephemeral data.
-    ///
-    /// Setting this to zero disables limits on total ephemeral layer size.
-    pub ephemeral_bytes_per_memory_kb: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -411,8 +400,6 @@ struct PageServerConfigBuilder {
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

    validate_vectored_get: BuilderValue<bool>,
-
-    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
 }

 impl PageServerConfigBuilder {
@@ -499,7 +486,6 @@ impl PageServerConfigBuilder {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
-            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
        }
    }
 }
@@ -679,10 +665,6 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

-    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
-        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -738,7 +720,6 @@ impl PageServerConfigBuilder {
                get_vectored_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
-                ephemeral_bytes_per_memory_kb,
            }
            CUSTOM LOGIC
            {
@@ -1029,9 +1010,6 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
-                "ephemeral_bytes_per_memory_kb" => {
-                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1113,7 +1091,6 @@ impl PageServerConf {
                    .expect("Invalid default constant"),
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
        }
    }
 }
@@ -1351,7 +1328,6 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1423,7 +1399,6 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,9 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{
-    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
-};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -43,7 +41,6 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
-    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
@@ -70,19 +67,15 @@ pub async fn collect_metrics(
        None,
        "synthetic size calculation",
        false,
-        {
-            let tenant_manager = tenant_manager.clone();
-            async move {
-                calculate_synthetic_size_worker(
-                    tenant_manager,
-                    synthetic_size_calculation_interval,
-                    &cancel,
-                    &worker_ctx,
-                )
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
-                Ok(())
-            }
+        async move {
+            calculate_synthetic_size_worker(
+                synthetic_size_calculation_interval,
+                &cancel,
+                &worker_ctx,
+            )
+            .instrument(info_span!("synthetic_size_worker"))
+            .await?;
+            Ok(())
        },
    );

@@ -123,7 +116,7 @@ pub async fn collect_metrics(
        let started_at = Instant::now();

        // these are point in time, with variable "now"
-        let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
+        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

        let metrics = Arc::new(metrics);

@@ -278,7 +271,6 @@ async fn reschedule(

 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
-    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
@@ -291,7 +283,7 @@ async fn calculate_synthetic_size_worker(
    loop {
        let started_at = Instant::now();

-        let tenants = match tenant_manager.list_tenants() {
+        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
            Err(e) => {
                warn!("cannot get tenant list: {e:#}");
@@ -310,14 +302,10 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
+            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
                continue;
            };

-            if !tenant.is_active() {
-                continue;
-            }
-
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
@@ -355,7 +343,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    };

    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate.
+    // mean the synthetic size worker should terminate. we do not need any checks
+    // in this function because `mgr::get_tenant` will error out after shutdown has
+    // progressed to shutting down tenants.
    let shutting_down = matches!(
        e.downcast_ref::<PageReconstructError>(),
        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,4 +1,3 @@
-use crate::tenant::mgr::TenantManager;
 use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
@@ -182,7 +181,6 @@ impl MetricsKey {
 }

 pub(super) async fn collect_all_metrics(
-    tenant_manager: &Arc<TenantManager>,
    cached_metrics: &Cache,
    ctx: &RequestContext,
 ) -> Vec<RawMetric> {
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(

    let started_at = std::time::Instant::now();

-    let tenants = match tenant_manager.list_tenants() {
+    let tenants = match crate::tenant::mgr::list_tenants().await {
        Ok(tenants) => tenants,
        Err(err) => {
            tracing::error!("failed to list tenants: {:?}", err);
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
-            tenant_manager
-                .get_attached_tenant_shard(id)
+            crate::tenant::mgr::get_tenant(id, true)
                .ok()
                .map(|tenant| (id.tenant_id, tenant))
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,6 +61,7 @@ use crate::{
    metrics::disk_usage_based_eviction::METRICS,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
+        self,
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
-    let tenants = tenant_manager
-        .list_tenants()
+    let tenants = tenant::mgr::list_tenants()
+        .await
        .context("get list of tenants")?;

    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
-            Ok(tenant) if tenant.is_active() => tenant,
-            Ok(_) => {
-                debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
-                continue;
-            }
+        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
+            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
                debug!("failed to get tenant: {e:#}");
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1038,7 +1038,7 @@ paths:
                  format: hex
      responses:
        "201":
-          description: Timeline was created, or already existed with matching parameters
+          description: TimelineInfo
          content:
            application/json:
              schema:
@@ -1068,17 +1068,11 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
        "409":
-          description: Timeline already exists, with different parameters.  Creation cannot proceed.
+          description: Timeline already exists, creation skipped
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "429":
-          description: A creation request was sent for the same Timeline Id while a creation was already in progress.  Back off and retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
        "500":
          description: Generic operation error
          content:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
-    GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
@@ -249,11 +249,16 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
+                //
+                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -264,9 +269,6 @@ impl From<GetTenantError> for ApiError {
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
-            GetActiveTenantError::Broken(reason) => {
-                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
-            }
            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
            GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -277,6 +279,19 @@ impl From<GetActiveTenantError> for ApiError {
    }
 }

+impl From<SetNewTenantConfigError> for ApiError {
+    fn from(e: SetNewTenantConfigError) -> ApiError {
+        match e {
+            SetNewTenantConfigError::GetTenant(tid) => {
+                ApiError::NotFound(anyhow!("tenant {}", tid).into())
+            }
+            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -480,7 +495,7 @@ async fn timeline_create_handler(
    async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -520,13 +535,10 @@ async fn timeline_create_handler(
                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
                )
            }
-            Err(e @ tenant::CreateTimelineError::Conflict) => {
-                json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
-            }
-            Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
-                StatusCode::TOO_MANY_REQUESTS,
-                HttpErrorBody::from_msg(e.to_string()),
-            ),
+            Err(
+                e @ tenant::CreateTimelineError::Conflict
+                | e @ tenant::CreateTimelineError::AlreadyCreating,
+            ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
                StatusCode::NOT_ACCEPTABLE,
                HttpErrorBody::from_msg(format!("{err:#}")),
@@ -569,7 +581,7 @@ async fn timeline_list_handler(
    let response_data = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -607,7 +619,6 @@ async fn timeline_preserve_initdb_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);

    // Part of the process for disaster recovery from safekeeper-stored WAL:
    // If we don't recover into a new timeline but want to keep the timeline ID,
@@ -615,9 +626,7 @@ async fn timeline_preserve_initdb_handler(
    // location where timeline recreation cand find it.

    async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -659,7 +668,7 @@ async fn timeline_detail_handler(
    let timeline_info = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -846,7 +855,7 @@ async fn timeline_delete_handler(

    let tenant = state
        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)
+        .get_attached_tenant_shard(tenant_shard_id, false)
        .map_err(|e| {
            match e {
                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -964,11 +973,10 @@ async fn tenant_list_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
-    let state = get_state(&request);

-    let response_data = state
-        .tenant_manager
-        .list_tenants()
+    let response_data = mgr::list_tenants()
+        .instrument(info_span!("tenant_list"))
+        .await
        .map_err(|_| {
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
@@ -991,27 +999,9 @@ async fn tenant_status(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);
-
-    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
-    let activate = true;
-    #[cfg(feature = "testing")]
-    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);

    let tenant_info = async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        if activate {
-            // This is advisory: we prefer to let the tenant activate on-demand when this function is
-            // called, but it is still valid to return 200 and describe the current state of the tenant
-            // if it doesn't make it into an active state.
-            tenant
-                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-                .await
-                .ok();
-        }
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -1084,7 +1074,9 @@ async fn tenant_size_handler(
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();
-    let state = get_state(&request);
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;

    if !tenant_shard_id.is_zero() {
        return Err(ApiError::BadRequest(anyhow!(
@@ -1092,12 +1084,6 @@ async fn tenant_size_handler(
        )));
    }

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
    // this can be long operation
    let inputs = tenant
        .gather_size_inputs(
@@ -1166,15 +1152,10 @@ async fn tenant_shard_split_handler(
    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
    let new_shards = state
        .tenant_manager
        .shard_split(
-            tenant,
+            tenant_shard_id,
            ShardCount::new(req.new_shard_count),
            req.new_stripe_size,
            &ctx,
@@ -1392,11 +1373,8 @@ async fn get_tenant_config_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);

-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, false)?;

    let response = HashMap::from([
        (
@@ -1424,31 +1402,15 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

-    let new_tenant_conf =
+    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
-
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    let tenant = state
+    state
        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(
-        new_tenant_conf.clone(),
-        tenant.get_generation(),
-        &ShardParameters::default(),
-    );
-
-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+        .set_new_tenant_config(tenant_conf, tenant_id)
+        .instrument(info_span!("tenant_config", %tenant_id))
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -1672,12 +1634,10 @@ async fn handle_tenant_break(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;

-    let state = get_state(&r);
-    state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?
-        .set_broken("broken from test".to_owned())
-        .await;
+    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
+        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
+
+    tenant.set_broken("broken from test".to_owned()).await;

    json_response(StatusCode::OK, ())
 }
@@ -1921,7 +1881,7 @@ async fn active_timeline_of_active_tenant(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1483,18 +1483,12 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });

 pub(crate) struct WalIngestMetrics {
-    pub(crate) bytes_received: IntCounter,
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
-    bytes_received: register_int_counter!(
-        "pageserver_wal_ingest_bytes_received",
-        "Bytes of WAL ingested from safekeepers",
-    )
-    .unwrap(),
    records_received: register_int_counter!(
        "pageserver_wal_ingest_records_received",
        "Number of WAL records received from safekeepers"
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -760,7 +760,6 @@ impl PageServerHandler {
        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
        timeline
            .import_basebackup_from_tar(
-                tenant.clone(),
                &mut copyin_reader,
                base_lsn,
                self.broker_client.clone(),
@@ -876,13 +875,7 @@ impl PageServerHandler {
            if lsn <= last_record_lsn {
                lsn = last_record_lsn;
            } else {
-                timeline
-                    .wait_lsn(
-                        lsn,
-                        crate::tenant::timeline::WaitLsnWaiter::PageService,
-                        ctx,
-                    )
-                    .await?;
+                timeline.wait_lsn(lsn, ctx).await?;
                // Since we waited for 'lsn' to arrive, that is now the last
                // record LSN. (Or close enough for our purposes; the
                // last-record LSN can advance immediately after we return
@@ -894,13 +887,7 @@ impl PageServerHandler {
                    "invalid LSN(0) in request".into(),
                ));
            }
-            timeline
-                .wait_lsn(
-                    lsn,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    ctx,
-                )
-                .await?;
+            timeline.wait_lsn(lsn, ctx).await?;
        }

        if lsn < **latest_gc_cutoff_lsn {
@@ -1227,13 +1214,7 @@ impl PageServerHandler {
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
-            timeline
-                .wait_lsn(
-                    lsn,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    ctx,
-                )
-                .await?;
+            timeline.wait_lsn(lsn, ctx).await?;
            timeline
                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -214,12 +214,13 @@ pub enum TaskKind {
    /// Internally, `Client` hands over requests to the `Connection` object.
    /// The `Connection` object is responsible for speaking the wire protocol.
    ///
-    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
+    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
+    /// That abstraction doesn't use `task_mgr`.
    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
-    /// Once the connection is established, the `TaskHandle` task spawns a
-    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
+    /// Once the connection is established, the `TaskHandle` task creates a
+    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
@@ -229,6 +230,7 @@ pub enum TaskKind {
    WalReceiverManager,

    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
+    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
    ///
    /// [`WalReceiverManager`]: Self::WalReceiverManager
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,7 +12,6 @@
 //!

 use anyhow::{bail, Context};
-use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
@@ -99,7 +98,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::Mutex;
+use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use crate::span;
@@ -261,7 +260,7 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,

    tenant_shard_id: TenantShardId,

@@ -1412,7 +1411,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        self: &Arc<Tenant>,
+        &self,
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Option<TimelineId>,
        mut ancestor_start_lsn: Option<Lsn>,
@@ -1516,7 +1515,7 @@ impl Tenant {
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
                    ancestor_timeline
-                        .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
+                        .wait_lsn(*lsn, ctx)
                        .await
                        .map_err(|e| match e {
                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1560,7 +1559,7 @@ impl Tenant {
            })?;
        }

-        loaded_timeline.activate(self.clone(), broker_client, None, ctx);
+        loaded_timeline.activate(broker_client, None, ctx);

        Ok(loaded_timeline)
    }
@@ -1607,7 +1606,7 @@ impl Tenant {
        );

        {
-            let conf = self.tenant_conf.load();
+            let conf = self.tenant_conf.read().unwrap();

            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);
@@ -1634,7 +1633,7 @@ impl Tenant {
        }

        {
-            let conf = self.tenant_conf.load();
+            let conf = self.tenant_conf.read().unwrap();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(());
@@ -1732,12 +1731,7 @@ impl Tenant {
            let mut activated_timelines = 0;

            for timeline in timelines_to_activate {
-                timeline.activate(
-                    self.clone(),
-                    broker_client.clone(),
-                    background_jobs_can_start,
-                    ctx,
-                );
+                timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
                activated_timelines += 1;
            }

@@ -1783,7 +1777,7 @@ impl Tenant {
    async fn shutdown(
        &self,
        shutdown_progress: completion::Barrier,
-        shutdown_mode: timeline::ShutdownMode,
+        freeze_and_flush: bool,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();

@@ -1830,8 +1824,16 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let timeline_id = timeline.timeline_id;
-                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
-                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
+
+                let span =
+                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
+                js.spawn(async move {
+                    if freeze_and_flush {
+                        timeline.flush_and_shutdown().instrument(span).await
+                    } else {
+                        timeline.shutdown().instrument(span).await
+                    }
+                });
            })
        };
        // test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2061,12 +2063,7 @@ impl Tenant {
                TenantState::Active { .. } => {
                    return Ok(());
                }
-                TenantState::Broken { reason, .. } => {
-                    // This is fatal, and reported distinctly from the general case of "will never be active" because
-                    // it's logically a 500 to external API users (broken is always a bug).
-                    return Err(GetActiveTenantError::Broken(reason));
-                }
-                TenantState::Stopping { .. } => {
+                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
                }
@@ -2075,14 +2072,14 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.load().location.attach_mode
+        self.tenant_conf.read().unwrap().location.attach_mode
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
    /// rare external API calls, like a reconciliation at startup.
    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.load();
+        let conf = self.tenant_conf.read().unwrap();

        let location_config_mode = match conf.location.attach_mode {
            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2229,7 +2226,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.load().tenant_conf.clone()
+        self.tenant_conf.read().unwrap().tenant_conf.clone()
    }

    pub fn effective_config(&self) -> TenantConf {
@@ -2238,84 +2235,84 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        let heatmap_period = tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2327,40 +2324,26 @@ impl Tenant {
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        // Use read-copy-update in order to avoid overwriting the location config
-        // state if this races with [`Tenant::set_new_location_config`]. Note that
-        // this race is not possible if both request types come from the storage
-        // controller (as they should!) because an exclusive op lock is required
-        // on the storage controller side.
-        self.tenant_conf.rcu(|inner| {
-            Arc::new(AttachedTenantConf {
-                tenant_conf: new_tenant_conf.clone(),
-                location: inner.location,
-            })
-        });
-
-        self.tenant_conf_updated(&new_tenant_conf);
+        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; // only `tenant_conf` is replaced; `location` is kept
+        self.tenant_conf_updated(); // takes its own read lock; the temporary write guard above is already dropped
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(); // no config is passed along any more
        }
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        let new_tenant_conf = new_conf.tenant_conf.clone();
-
-        self.tenant_conf.store(Arc::new(new_conf));
-
-        self.tenant_conf_updated(&new_tenant_conf);
+        *self.tenant_conf.write().unwrap() = new_conf; // replaces the whole AttachedTenantConf, `location` included
+        self.tenant_conf_updated(); // takes its own read lock; the temporary write guard above is already dropped
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(); // no config is passed along any more
        }
    }

@@ -2374,8 +2357,11 @@ impl Tenant {
            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
    }

-    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
-        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
+    pub(crate) fn tenant_conf_updated(&self) { // recompute the throttle config from the stored tenant_conf
+        let conf = { // inner block: the read guard is dropped before reconfigure() runs
+            let guard = self.tenant_conf.read().unwrap(); // unwrap panics if the lock is poisoned
+            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
+        };
        self.timeline_get_throttle.reconfigure(conf)
    }

@@ -2523,7 +2509,7 @@ impl Tenant {
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
-            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
+            tenant_conf: Arc::new(RwLock::new(attached_conf)),
        }
    }

@@ -3509,7 +3495,7 @@ impl Tenant {
    }

    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.load().tenant_conf.clone()
+        self.tenant_conf.read().unwrap().tenant_conf.clone()
    }
 }

@@ -3657,9 +3643,6 @@ pub(crate) mod harness {
                heatmap_period: Some(tenant_conf.heatmap_period),
                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
-                image_layer_creation_check_threshold: Some(
-                    tenant_conf.image_layer_creation_check_threshold,
-                ),
            }
        }
    }
@@ -3858,7 +3841,6 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
-    use tests::timeline::ShutdownMode;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4304,7 +4286,7 @@ mod tests {
            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
+                .shutdown(Default::default(), true)
                .instrument(harness.span())
                .await
                .ok()
@@ -4345,7 +4327,7 @@ mod tests {

            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
+                .shutdown(Default::default(), true)
                .instrument(harness.span())
                .await
                .ok()
@@ -5126,7 +5108,7 @@ mod tests {
            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown(super::timeline::ShutdownMode::Hard)
+                .shutdown()
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -57,9 +57,6 @@ pub mod defaults {
    // throughputs up to 1GiB/s per timeline.
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-    // By default ingest enough WAL for two new L0 layers before checking if new image
-    // image layers should be created.
-    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -365,10 +362,6 @@ pub struct TenantConf {
    pub lazy_slru_download: bool,

    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
-
-    // How much WAL must be ingested before checking again whether a new image layer is required.
-    // Expresed in multiples of checkpoint distance.
-    pub image_layer_creation_check_threshold: u8,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -461,9 +454,6 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
-
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub image_layer_creation_check_threshold: Option<u8>,
 }

 impl TenantConfOpt {
@@ -518,9 +508,6 @@ impl TenantConfOpt {
                .timeline_get_throttle
                .clone()
                .unwrap_or(global_conf.timeline_get_throttle),
-            image_layer_creation_check_threshold: self
-                .image_layer_creation_check_threshold
-                .unwrap_or(global_conf.image_layer_creation_check_threshold),
        }
    }
 }
@@ -561,7 +548,6 @@ impl Default for TenantConf {
            heatmap_period: Duration::ZERO,
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
-            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
        }
    }
 }
@@ -635,7 +621,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
-            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -14,10 +14,7 @@ use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
-    tenant::{
-        mgr::{TenantSlot, TenantsMapRemoveResult},
-        timeline::ShutdownMode,
-    },
+    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
 };

 use super::{
@@ -466,7 +463,7 @@ impl DeleteTenantFlow {
        // tenant.shutdown
        // Its also bad that we're holding tenants.read here.
        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
+        if tenant.shutdown(progress, false).await.is_err() {
            return Err(DeleteTenantError::Other(anyhow::anyhow!(
                "tenant shutdown is already in progress"
            )));
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,10 +72,6 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) fn id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
-    }
-
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -346,6 +346,35 @@ where
    }
 }

+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub enum InMemoryLayerHandle { // identifies an in-memory layer; resolve with LayerMap::get_in_memory_layer
+    Open { // refers to the layer map's open layer
+        lsn_floor: Lsn, // start of the layer's LSN range when the handle was created
+        end_lsn: Lsn, // end of the layer's LSN range when the handle was created
+    },
+    Frozen { // refers to one of the layer map's frozen layers
+        idx: usize, // index into `frozen_layers` when the handle was created
+        lsn_floor: Lsn, // start of the layer's LSN range when the handle was created
+        end_lsn: Lsn, // end of the layer's LSN range when the handle was created
+    },
+}
+
+impl InMemoryLayerHandle {
+    pub fn get_lsn_floor(&self) -> Lsn { // returns the stored `lsn_floor` of either variant
+        match self {
+            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
+            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
+        }
+    }
+
+    pub fn get_end_lsn(&self) -> Lsn { // returns the stored `end_lsn` of either variant
+        match self {
+            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
+            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
+        }
+    }
+}
+
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -547,18 +576,41 @@ impl LayerMap {
        self.historic.iter()
    }

-    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
+    /// Get a handle for the first in memory layer that matches the provided predicate.
+    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
+    ///
+    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
+    /// the same exclusive region established by holding the layer manager lock.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
    where
        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
    {
        if let Some(open) = &self.open_layer {
            if pred(open) {
-                return Some(open.clone());
+                return Some(InMemoryLayerHandle::Open { // the open layer is checked first
+                    lsn_floor: open.get_lsn_range().start,
+                    end_lsn: open.get_lsn_range().end,
+                });
            }
        }

-        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
+        let pos = self.frozen_layers.iter().rev().position(pred); // scan from the back, as the replaced rfind did
+        pos.map(|rev_idx| {
+            let idx = self.frozen_layers.len() - 1 - rev_idx; // position() counted from the back; convert to a front-based index
+            InMemoryLayerHandle::Frozen {
+                idx,
+                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
+                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
+            }
+        })
+    }
+
+    /// Get the layer pointed to by the provided handle (`None` if it cannot be resolved).
+    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
+        match handle {
+            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), // current open layer, if any; handle LSNs are not checked
+            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), // `None` if `idx` is out of bounds; LSNs are not checked
+        }
    }

    ///
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -4,7 +4,7 @@
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::LocationConfigMode;
+use pageserver_api::models::{LocationConfigMode, ShardParameters};
 use pageserver_api::shard::{
    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
@@ -16,7 +16,6 @@ use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use sysinfo::SystemExt;
 use tokio::fs;
 use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

@@ -40,11 +39,10 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
+    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
-use crate::tenant::storage_layer::inmemory_layer;
-use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};

@@ -545,18 +543,6 @@ pub async fn init_tenant_mgr(

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);

-    // Initialize dynamic limits that depend on system resources
-    let system_memory =
-        sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
-            .total_memory();
-    let max_ephemeral_layer_bytes =
-        conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
-    tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
-    inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
-        max_ephemeral_layer_bytes,
-        std::sync::atomic::Ordering::Relaxed,
-    );
-
    // Scan local filesystem for attached tenants
    let tenant_configs = init_load_tenant_configs(conf).await?;

@@ -784,9 +770,11 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                            shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                            join_set.spawn(
                                async move {
+                                    let freeze_and_flush = true;
+
                                    let res = {
                                        let (_guard, shutdown_progress) = completion::channel();
-                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
+                                        t.shutdown(shutdown_progress, freeze_and_flush).await
                                    };

                                    if let Err(other_progress) = res {
@@ -887,6 +875,16 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+    #[error(transparent)]
+    Other(anyhow::Error),
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
    #[error("Bad config request: {0}")]
@@ -912,21 +910,32 @@ impl TenantManager {
        self.conf
    }

-    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
-    /// undergoing a state change (i.e. slot is InProgress).
-    ///
-    /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
-    /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
+    /// Gets the attached tenant from the in-memory data, erroring if it is absent, in secondary mode, in an `InProgress` slot, or does not match the query.
+    /// With `active_only = true`, only tenants in the `Active` state are returned; `Broken` and other non-active tenants yield an error.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
+        active_only: bool,
    ) -> Result<Arc<Tenant>, GetTenantError> {
        let locked = self.tenants.read().unwrap();

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

        match peek_slot {
-            Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
+            Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
+                TenantState::Broken {
+                    reason,
+                    backtrace: _,
+                } if active_only => Err(GetTenantError::Broken(reason)),
+                TenantState::Active => Ok(Arc::clone(tenant)),
+                _ => {
+                    if active_only {
+                        Err(GetTenantError::NotActive(tenant_shard_id))
+                    } else {
+                        Ok(Arc::clone(tenant))
+                    }
+                }
+            },
            Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
            None | Some(TenantSlot::Secondary(_)) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
@@ -1106,7 +1115,7 @@ impl TenantManager {
                };

                info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                match tenant.shutdown(progress, false).await {
                    Ok(()) => {}
                    Err(barrier) => {
                        info!("Shutdown already in progress, waiting for it to complete");
@@ -1222,7 +1231,7 @@ impl TenantManager {
                    TenantSlot::Attached(tenant) => {
                        let (_guard, progress) = utils::completion::channel();
                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                        match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                        match tenant.shutdown(progress, false).await {
                            Ok(()) => {
                                info!("Finished shutting down just-spawned tenant");
                            }
@@ -1272,7 +1281,7 @@ impl TenantManager {
        };

        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
+        match tenant.shutdown(progress, false).await {
            Ok(()) => {
                slot_guard.drop_old_value()?;
            }
@@ -1419,8 +1428,7 @@ impl TenantManager {
                    .wait_to_become_active(activation_timeout)
                    .await
                    .map_err(|e| match e {
-                        GetActiveTenantError::WillNotBecomeActive(_)
-                        | GetActiveTenantError::Broken(_) => {
+                        GetActiveTenantError::WillNotBecomeActive(_) => {
                            DeleteTenantError::InvalidState(tenant.current_state())
                        }
                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
@@ -1447,30 +1455,29 @@ impl TenantManager {
        result
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant_shard_id: TenantShardId,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<TenantShardId>> {
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
        let r = self
-            .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
+            .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
            .await;
        if r.is_err() {
            // Shard splitting might have left the original shard in a partially shut down state (it
            // stops the shard's remote timeline client).  Reset it to ensure we leave things in
            // a working state.
            if self.get(tenant_shard_id).is_some() {
-                tracing::warn!("Resetting after shard split failure");
+                tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
                if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
                    // Log this error because our return value will still be the original error, not this one.  This is
                    // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
                    // (e.g. has uploads disabled).  We can't do anything else: if reset fails then shutting the tenant down or
                    // setting it broken probably won't help either.
-                    tracing::error!("Failed to reset: {e}");
+                    tracing::error!("Failed to reset {tenant_shard_id}: {e}");
                }
            }
        }
@@ -1480,12 +1487,12 @@ impl TenantManager {

    pub(crate) async fn do_shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant_shard_id: TenantShardId,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<TenantShardId>> {
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        let tenant = get_tenant(tenant_shard_id, true)?;

        // Validate the incoming request
        if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
@@ -1531,6 +1538,7 @@ impl TenantManager {
            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
            // have been left in a partially-shut-down state.
            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
+            self.reset_tenant(tenant_shard_id, false, ctx).await?;
            return Err(e);
        }

@@ -1648,14 +1656,7 @@ impl TenantManager {
                    fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                        "failpoint"
                    )));
-                    if let Err(e) = timeline
-                        .wait_lsn(
-                            *target_lsn,
-                            crate::tenant::timeline::WaitLsnWaiter::Tenant,
-                            ctx,
-                        )
-                        .await
-                    {
+                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
                        // Failure here might mean shutdown, in any case this part is an optimization
                        // and we shouldn't hold up the split operation.
                        tracing::warn!(
@@ -1676,7 +1677,7 @@ impl TenantManager {

        // Phase 5: Shut down the parent shard, and erase it from disk
        let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, ShutdownMode::Hard).await {
+        match parent.shutdown(progress, false).await {
            Ok(()) => {}
            Err(other) => {
                other.wait().await;
@@ -1935,23 +1936,38 @@ impl TenantManager {
        removal_result
    }

-    pub(crate) fn list_tenants(
+    pub(crate) async fn set_new_tenant_config(
        &self,
-    ) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
-        let tenants = TENANTS.read().unwrap();
-        let m = match &*tenants {
-            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
-        };
-        Ok(m.iter()
-            .filter_map(|(id, tenant)| match tenant {
-                TenantSlot::Attached(tenant) => {
-                    Some((*id, tenant.current_state(), tenant.generation()))
-                }
-                TenantSlot::Secondary(_) => None,
-                TenantSlot::InProgress(_) => None,
-            })
-            .collect())
+        new_tenant_conf: TenantConfOpt,
+        tenant_id: TenantId,
+    ) -> Result<(), SetNewTenantConfigError> {
+        // Legacy API: does not support sharding
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+        info!("configuring tenant {tenant_id}");
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        if !tenant.tenant_shard_id().shard_count.is_unsharded() {
+            // Note that we use ShardParameters::default below.
+            return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
+            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
+        )));
+        }
+
+        // This is a legacy API that only operates on attached tenants: the preferred
+        // API to use is the location_config/ endpoint, which lets the caller provide
+        // the full LocationConf.
+        let location_conf = LocationConf::attached_single(
+            new_tenant_conf.clone(),
+            tenant.generation,
+            &ShardParameters::default(),
+        );
+
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;
+        tenant.set_new_tenant_config(new_tenant_conf);
+        Ok(())
    }
 }

@@ -1964,12 +1980,51 @@ pub(crate) enum GetTenantError {

    #[error("Tenant {0} is not active")]
    NotActive(TenantShardId),
+    /// Broken is logically a subset of NotActive, but a distinct error is useful as
+    /// NotActive is usually a retryable state for API purposes, whereas Broken
+    /// is a stuck error state
+    #[error("Tenant is broken: {0}")]
+    Broken(String),

    // Initializing or shutting down: cannot authoritatively say whether we have this tenant
    #[error("Tenant map is not available: {0}")]
    MapState(#[from] TenantMapError),
 }

+/// Gets the tenant from the in-memory data, erroring if it is absent, in secondary mode, in an `InProgress` slot, or does not match the query.
+/// With `active_only = true`, only tenants in the `Active` state are returned; `Broken` and other non-active tenants yield an error.
+///
+/// This function is cancel-safe: it is synchronous and has no await points.
+pub(crate) fn get_tenant(
+    tenant_shard_id: TenantShardId,
+    active_only: bool,
+) -> Result<Arc<Tenant>, GetTenantError> {
+    let locked = TENANTS.read().unwrap();
+
+    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
+
+    match peek_slot {
+        Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
+            TenantState::Broken {
+                reason,
+                backtrace: _,
+            } if active_only => Err(GetTenantError::Broken(reason)),
+            TenantState::Active => Ok(Arc::clone(tenant)),
+            _ => {
+                if active_only {
+                    Err(GetTenantError::NotActive(tenant_shard_id))
+                } else {
+                    Ok(Arc::clone(tenant))
+                }
+            }
+        },
+        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
+        None | Some(TenantSlot::Secondary(_)) => {
+            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetActiveTenantError {
    /// We may time out either while TenantSlot is InProgress, or while the Tenant
@@ -1993,12 +2048,6 @@ pub(crate) enum GetActiveTenantError {
    /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
    #[error("will not become active.  Current state: {0}")]
    WillNotBecomeActive(TenantState),
-
-    /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
-    /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
-    /// never happen.
-    #[error("Tenant is broken: {0}")]
-    Broken(String),
 }

 /// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
@@ -2218,6 +2267,27 @@ pub(crate) enum TenantMapListError {
    Initializing,
 }

+/// Lists every attached tenant shard with its current state and generation, for the mgmt API.
+/// `Secondary` and `InProgress` slots are skipped; returns `TenantMapListError::Initializing`
+/// while the tenant map is still in the `Initializing` state.
+pub(crate) async fn list_tenants(
+) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
+    let tenants = TENANTS.read().unwrap();
+    let m = match &*tenants {
+        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
+        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
+    };
+    Ok(m.iter()
+        .filter_map(|(id, tenant)| match tenant {
+            TenantSlot::Attached(tenant) => {
+                Some((*id, tenant.current_state(), tenant.generation()))
+            }
+            TenantSlot::Secondary(_) => None,
+            TenantSlot::InProgress(_) => None,
+        })
+        .collect())
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapInsertError {
    #[error(transparent)]
@@ -2663,11 +2733,11 @@ where
    let attached_tenant = match slot_guard.get_old_value() {
        Some(TenantSlot::Attached(tenant)) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let shutdown_mode = ShutdownMode::Hard;
+            let freeze_and_flush = false;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, shutdown_mode).await {
+            match tenant.shutdown(progress, freeze_and_flush).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1569,7 +1569,7 @@ impl RemoteTimelineClient {
    /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
    ///
    /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
    pub(crate) fn stop(&self) {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -786,35 +786,6 @@ impl<'a> TenantDownloader<'a> {
            // Existing on-disk layers: just update their access time.
            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                tracing::debug!("Layer {} is already on disk", layer.name);
-
-                if cfg!(debug_assertions) {
-                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
-                    // are already present on disk are really there.
-                    let local_path = self
-                        .conf
-                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
-                        .join(layer.name.file_name());
-                    match tokio::fs::metadata(&local_path).await {
-                        Ok(meta) => {
-                            tracing::debug!(
-                                "Layer {} present at {}, size {}",
-                                layer.name,
-                                local_path,
-                                meta.len(),
-                            );
-                        }
-                        Err(e) => {
-                            tracing::warn!(
-                                "Layer {} not found at {} ({})",
-                                layer.name,
-                                local_path,
-                                e
-                            );
-                            debug_assert!(false);
-                        }
-                    }
-                }
-
                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                    || on_disk.access_time != layer.access_time
                {
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -9,7 +9,6 @@ use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
        config::AttachmentMode,
-        mgr::GetTenantError,
        mgr::TenantManager,
        remote_timeline_client::remote_heatmap_path,
        span::debug_assert_current_span_has_tenant_id,
@@ -293,11 +292,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            "Starting heatmap write on command");
        let tenant = self
            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id)
+            .get_attached_tenant_shard(*tenant_shard_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;
-        if !tenant.is_active() {
-            return Err(GetTenantError::NotActive(*tenant_shard_id).into());
-        }

        Ok(UploadPending {
            // Ignore our state for last digest: this forces an upload even if nothing has changed
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -3,7 +3,7 @@
 pub mod delta_layer;
 mod filename;
 pub mod image_layer;
-pub(crate) mod inmemory_layer;
+mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;

@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};

 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

-use self::inmemory_layer::InMemoryLayerFileId;
-
+use super::layer_map::InMemoryLayerHandle;
+use super::timeline::layer_manager::LayerManager;
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;

@@ -204,30 +204,23 @@ impl Default for ValuesReconstructState {
    }
 }

-/// A key that uniquely identifies a layer in a timeline
-#[derive(Debug, PartialEq, Eq, Clone, Hash)]
-pub(crate) enum LayerId {
-    PersitentLayerId(PersistentLayerKey),
-    InMemoryLayerId(InMemoryLayerFileId),
+/// Description of layer to be read - the layer map can turn
+/// this description into the actual layer.
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub(crate) enum ReadableLayerDesc {
+    Persistent {
+        desc: PersistentLayerDesc,
+        lsn_range: Range<Lsn>,
+    },
+    InMemory {
+        handle: InMemoryLayerHandle,
+        lsn_ceil: Lsn,
+    },
 }

-/// Layer wrapper for the read path. Note that it is valid
-/// to use these layers even after external operations have
-/// been performed on them (compaction, freeze, etc.).
+/// Wrapper for `ReadableLayerDesc` that orders descriptions by their LSN bounds
 #[derive(Debug)]
-pub(crate) enum ReadableLayer {
-    PersistentLayer(Layer),
-    InMemoryLayer(Arc<InMemoryLayer>),
-}
-
-/// A partial description of a read to be done.
-#[derive(Debug, Clone)]
-struct ReadDesc {
-    /// An id used to resolve the readable layer within the fringe
-    layer_id: LayerId,
-    /// Lsn range for the read, used for selecting the next read
-    lsn_range: Range<Lsn>,
-}
+struct ReadableLayerDescOrdered(ReadableLayerDesc);

 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -238,64 +231,41 @@ struct ReadDesc {
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<LayerId, LayerKeyspace>,
-}
-
-#[derive(Debug)]
-struct LayerKeyspace {
-    layer: ReadableLayer,
-    target_keyspace: KeySpace,
+    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
+    layers: HashMap<ReadableLayerDesc, KeySpace>,
 }

 impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
-            planned_reads_by_lsn: BinaryHeap::new(),
+            layers_by_lsn: BinaryHeap::new(),
            layers: HashMap::new(),
        }
    }

-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_reads_by_lsn.pop() {
-            Some(desc) => desc,
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
+        let handle = match self.layers_by_lsn.pop() {
+            Some(h) => h,
            None => return None,
        };

-        let removed = self.layers.remove_entry(&read_desc.layer_id);
+        let removed = self.layers.remove_entry(&handle.0);
        match removed {
-            Some((
-                _,
-                LayerKeyspace {
-                    layer,
-                    target_keyspace,
-                },
-            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
+            Some((layer, keyspace)) => Some((layer, keyspace)),
            None => unreachable!("fringe internals are always consistent"),
        }
    }

-    pub(crate) fn update(
-        &mut self,
-        layer: ReadableLayer,
-        keyspace: KeySpace,
-        lsn_range: Range<Lsn>,
-    ) {
-        let layer_id = layer.id();
-        let entry = self.layers.entry(layer_id.clone());
+    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
+        let entry = self.layers.entry(layer.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.merge(&keyspace);
+                entry.get_mut().merge(&keyspace);
            }
            Entry::Vacant(entry) => {
-                self.planned_reads_by_lsn.push(ReadDesc {
-                    lsn_range,
-                    layer_id: layer_id.clone(),
-                });
-                entry.insert(LayerKeyspace {
-                    layer,
-                    target_keyspace: keyspace,
-                });
+                self.layers_by_lsn
+                    .push(ReadableLayerDescOrdered(entry.key().clone()));
+                entry.insert(keyspace);
            }
        }
    }
@@ -307,55 +277,77 @@ impl Default for LayerFringe {
    }
 }

-impl Ord for ReadDesc {
+impl Ord for ReadableLayerDescOrdered {
    fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
+        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
        if ord == std::cmp::Ordering::Equal {
-            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
+            self.0
+                .get_lsn_floor()
+                .cmp(&other.0.get_lsn_floor())
+                .reverse()
        } else {
            ord
        }
    }
 }

-impl PartialOrd for ReadDesc {
+impl PartialOrd for ReadableLayerDescOrdered {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }

-impl PartialEq for ReadDesc {
+impl PartialEq for ReadableLayerDescOrdered {
    fn eq(&self, other: &Self) -> bool {
-        self.lsn_range == other.lsn_range
+        self.0.get_lsn_floor() == other.0.get_lsn_floor()
+            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
    }
 }

-impl Eq for ReadDesc {}
+impl Eq for ReadableLayerDescOrdered {}

-impl ReadableLayer {
-    pub(crate) fn id(&self) -> LayerId {
+impl ReadableLayerDesc {
+    pub(crate) fn get_lsn_floor(&self) -> Lsn {
        match self {
-            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
-            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
+            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
+        }
+    }
+
+    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
+        match self {
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
+            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
        }
    }

    pub(crate) async fn get_values_reconstruct_data(
        &self,
+        layer_manager: &LayerManager,
        keyspace: KeySpace,
-        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        match self {
-            ReadableLayer::PersistentLayer(layer) => {
+            ReadableLayerDesc::Persistent { desc, lsn_range } => {
+                let layer = layer_manager.get_from_desc(desc);
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(
+                        keyspace,
+                        lsn_range.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
                    .await
            }
-            ReadableLayer::InMemoryLayer(layer) => {
+            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
+                let layer = layer_manager
+                    .layer_map()
+                    .get_in_memory_layer(handle)
+                    .unwrap();
+
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -47,7 +47,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
-use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -947,34 +946,6 @@ impl DeltaLayerInner {
        Ok(planner.finish())
    }

-    fn get_min_read_buffer_size(
-        planned_reads: &[VectoredRead],
-        read_size_soft_max: usize,
-    ) -> usize {
-        let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
-            return read_size_soft_max;
-        };
-
-        let largest_read_size = largest_read.size();
-        if largest_read_size > read_size_soft_max {
-            // If the read is oversized, it should only contain one key.
-            let offenders = largest_read
-                .blobs_at
-                .as_slice()
-                .iter()
-                .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
-                .join(", ");
-            tracing::warn!(
-                "Oversized vectored read ({} > {}) for keys {}",
-                largest_read_size,
-                read_size_soft_max,
-                offenders
-            );
-        }
-
-        largest_read_size
-    }
-
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -988,8 +959,7 @@ impl DeltaLayerInner {
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
-        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(BytesMut::with_capacity(buf_size));
+        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1016,7 +986,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(buf_size));
+                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));

                    continue;
                }
@@ -1240,16 +1210,9 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
 mod test {
    use std::collections::BTreeMap;

-    use itertools::MinMaxResult;
-    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
-    use rand::RngCore;
-
    use super::*;
    use crate::{
-        context::DownloadBehavior,
-        task_mgr::TaskKind,
-        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
-        DEFAULT_PG_VERSION,
+        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
    };

    /// Construct an index for a fictional delta layer and and then
@@ -1369,229 +1332,4 @@ mod test {

        assert_eq!(planned_blobs, expected_blobs);
    }
-
-    mod constants {
-        use utils::lsn::Lsn;
-
-        /// Offset used by all lsns in this test
-        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
-        /// Number of unique keys including in the test data
-        pub(super) const KEY_COUNT: u8 = 60;
-        /// Max number of different lsns for each key
-        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
-        /// Possible value sizes for each key along with a probability weight
-        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
-        /// Probability that there will be a gap between the current key and the next one (33.3%)
-        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
-        /// The minimum size of a key range in all the generated reads
-        pub(super) const MIN_RANGE_SIZE: i128 = 10;
-        /// The number of ranges included in each vectored read
-        pub(super) const RANGES_COUNT: u8 = 2;
-        /// The number of vectored reads performed
-        pub(super) const READS_COUNT: u8 = 100;
-        /// Soft max size of a vectored read. Will be violated if we have to read keys
-        /// with values larger than the limit
-        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
-    }
-
-    struct Entry {
-        key: Key,
-        lsn: Lsn,
-        value: Vec<u8>,
-    }
-
-    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
-        let mut current_key = Key::MIN;
-
-        let mut entries = Vec::new();
-        for _ in 0..constants::KEY_COUNT {
-            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
-            let mut lsns_iter =
-                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
-                    Some(Lsn(lsn.0 + 0x08))
-                });
-            let mut lsns = Vec::new();
-            while lsns.len() < count as usize {
-                let take = rng.gen_bool(0.5);
-                let lsn = lsns_iter.next().unwrap();
-                if take {
-                    lsns.push(lsn);
-                }
-            }
-
-            for lsn in lsns {
-                let size = constants::VALUE_SIZES
-                    .choose_weighted(rng, |item| item.1)
-                    .unwrap()
-                    .0;
-                let mut buf = vec![0; size];
-                rng.fill_bytes(&mut buf);
-
-                entries.push(Entry {
-                    key: current_key,
-                    lsn,
-                    value: buf,
-                })
-            }
-
-            let gap = constants::KEY_GAP_CHANGES
-                .choose_weighted(rng, |item| item.1)
-                .unwrap()
-                .0;
-            if gap {
-                current_key = current_key.add(2);
-            } else {
-                current_key = current_key.add(1);
-            }
-        }
-
-        entries
-    }
-
-    struct EntriesMeta {
-        key_range: Range<Key>,
-        lsn_range: Range<Lsn>,
-        index: BTreeMap<(Key, Lsn), Vec<u8>>,
-    }
-
-    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
-        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
-            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
-            _ => panic!("More than one entry is always expected"),
-        };
-
-        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
-            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
-            _ => panic!("More than one entry is always expected"),
-        };
-
-        let mut index = BTreeMap::new();
-        for entry in entries.iter() {
-            index.insert((entry.key, entry.lsn), entry.value.clone());
-        }
-
-        EntriesMeta {
-            key_range,
-            lsn_range,
-            index,
-        }
-    }
-
-    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
-        let start = key_range.start.to_i128();
-        let end = key_range.end.to_i128();
-
-        let mut keyspace = KeySpace::default();
-
-        for _ in 0..constants::RANGES_COUNT {
-            let mut range: Option<Range<Key>> = Option::default();
-            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
-                let range_start = rng.gen_range(start..end);
-                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
-                if range_end_offset >= end {
-                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
-                } else {
-                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
-                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
-                }
-            }
-            keyspace.ranges.push(range.unwrap());
-        }
-
-        keyspace
-    }
-
-    #[tokio::test]
-    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
-        let (tenant, ctx) = harness.load().await;
-
-        let timeline_id = TimelineId::generate();
-        let timeline = tenant
-            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
-            .await?;
-
-        tracing::info!("Generating test data ...");
-
-        let rng = &mut StdRng::seed_from_u64(0);
-        let entries = generate_entries(rng);
-        let entries_meta = get_entries_meta(&entries);
-
-        tracing::info!("Done generating {} entries", entries.len());
-
-        tracing::info!("Writing test data to delta layer ...");
-        let mut writer = DeltaLayerWriter::new(
-            harness.conf,
-            timeline_id,
-            harness.tenant_shard_id,
-            entries_meta.key_range.start,
-            entries_meta.lsn_range.clone(),
-        )
-        .await?;
-
-        for entry in entries {
-            let (_, res) = writer
-                .put_value_bytes(entry.key, entry.lsn, entry.value, false)
-                .await;
-            res?;
-        }
-
-        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
-
-        let inner = resident.get_inner_delta(&ctx).await?;
-
-        let file_size = inner.file.metadata().await?.len();
-        tracing::info!(
-            "Done writing test data to delta layer. Resulting file size is: {}",
-            file_size
-        );
-
-        for i in 0..constants::READS_COUNT {
-            tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
-
-            let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
-            let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-                inner.index_start_blk,
-                inner.index_root_blk,
-                block_reader,
-            );
-
-            let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
-            let mut reconstruct_state = ValuesReconstructState::new();
-            let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
-            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
-
-            let vectored_reads = DeltaLayerInner::plan_reads(
-                keyspace.clone(),
-                entries_meta.lsn_range.clone(),
-                data_end_offset,
-                index_reader,
-                planner,
-                &mut reconstruct_state,
-                &ctx,
-            )
-            .await?;
-
-            let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
-            let buf_size = DeltaLayerInner::get_min_read_buffer_size(
-                &vectored_reads,
-                constants::MAX_VECTORED_READ_BYTES,
-            );
-            let mut buf = Some(BytesMut::with_capacity(buf_size));
-
-            for read in vectored_reads {
-                let blobs_buf = vectored_blob_reader
-                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
-                    .await?;
-                for meta in blobs_buf.blobs.iter() {
-                    let value = &blobs_buf.buf[meta.start..meta.end];
-                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
-                }
-
-                buf = Some(blobs_buf.buf);
-            }
-        }
-
-        Ok(())
-    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -44,7 +44,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -541,25 +540,7 @@ impl ImageLayerInner {

        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
-            let buf_size = read.size();
-
-            if buf_size > max_vectored_read_bytes {
-                // If the read is oversized, it should only contain one key.
-                let offenders = read
-                    .blobs_at
-                    .as_slice()
-                    .iter()
-                    .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
-                    .join(", ");
-                tracing::warn!(
-                    "Oversized vectored read ({} > {}) for keys {}",
-                    buf_size,
-                    max_vectored_read_bytes,
-                    offenders
-                );
-            }
-
-            let buf = BytesMut::with_capacity(buf_size);
+            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
            let res = vectored_blob_reader.read_blobs(&read, buf).await;

            match res {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::{page_cache, walrecord};
+use crate::walrecord;
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
@@ -36,14 +36,10 @@ use super::{
    ValuesReconstructState,
 };

-#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
-pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
-
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
-    file_id: InMemoryLayerFileId,

    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
@@ -93,10 +89,7 @@ impl std::fmt::Debug for InMemoryLayerInner {
 ///
 /// This global state is used to implement behaviors that require a global view of the system, e.g.
 /// rolling layers proactively to limit the total amount of dirty data.
-pub(crate) struct GlobalResources {
-    // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
-    // Zero means unlimited.
-    pub(crate) max_dirty_bytes: AtomicU64,
+struct GlobalResources {
    // How many bytes are in all EphemeralFile objects
    dirty_bytes: AtomicU64,
    // How many layers are contributing to dirty_bytes
@@ -125,12 +118,11 @@ impl GlobalResourceUnits {

    /// Do not call this frequently: all timelines will write to these same global atomics,
    /// so this is a relatively expensive operation.  Wait at least a few seconds between calls.
-    ///
-    /// Returns the effective layer size limit that should be applied, if any, to keep
-    /// the total number of dirty bytes below the configured maximum.
-    fn publish_size(&mut self, size: u64) -> Option<u64> {
+    fn publish_size(&mut self, size: u64) {
        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
-            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
+            Ordering::Equal => {
+                return;
+            }
            Ordering::Greater => {
                let delta = size - self.dirty_bytes;
                let old = GLOBAL_RESOURCES
@@ -154,21 +146,6 @@ impl GlobalResourceUnits {
        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);

        self.dirty_bytes = size;
-
-        let max_dirty_bytes = GLOBAL_RESOURCES
-            .max_dirty_bytes
-            .load(AtomicOrdering::Relaxed);
-        if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
-            // Set the layer file limit to the average layer size: this implies that all above-average
-            // sized layers will be elegible for freezing.  They will be frozen in the order they
-            // next enter publish_size.
-            Some(
-                new_global_dirty_bytes
-                    / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
-            )
-        } else {
-            None
-        }
    }

    // Call publish_size if the input size differs from last published size by more than
@@ -197,17 +174,12 @@ impl Drop for GlobalResourceUnits {
    }
 }

-pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
-    max_dirty_bytes: AtomicU64::new(0),
+static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
    dirty_bytes: AtomicU64::new(0),
    dirty_layers: AtomicUsize::new(0),
 };

 impl InMemoryLayer {
-    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
-        self.file_id
-    }
-
    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }
@@ -222,10 +194,6 @@ impl InMemoryLayer {
        }
    }

-    pub(crate) fn try_len(&self) -> Option<u64> {
-        self.inner.try_read().map(|i| i.file.len()).ok()
-    }
-
    pub(crate) fn assert_writable(&self) {
        assert!(self.end_lsn.get().is_none());
    }
@@ -451,10 +419,8 @@ impl InMemoryLayer {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
-        let key = InMemoryLayerFileId(file.id());

        Ok(InMemoryLayer {
-            file_id: key,
            conf,
            timeline_id,
            tenant_shard_id,
@@ -520,10 +486,10 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub(crate) async fn tick(&self) -> Option<u64> {
+    pub(crate) async fn tick(&self) {
        let mut inner = self.inner.write().await;
        let size = inner.file.len();
-        inner.resource_units.publish_size(size)
+        inner.resource_units.publish_size(size);
    }

    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1759,18 +1759,6 @@ impl ResidentLayer {
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.owner.metadata()
    }
-
-    #[cfg(test)]
-    pub(crate) async fn get_inner_delta<'a>(
-        &'a self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
-        let owner = &self.owner.0;
-        match self.downloaded.get(owner, ctx).await? {
-            LayerKind::Delta(d) => Ok(d),
-            LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
-        }
-    }
 }

 impl AsLayerDesc for ResidentLayer {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -9,7 +9,6 @@ pub mod uninit;
 mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
-use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
@@ -20,7 +19,7 @@ use pageserver_api::{
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
-        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
+        EvictionPolicy, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, TenantShardId},
@@ -119,11 +118,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::config::TenantConf;
+use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
+use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
-use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -184,7 +183,7 @@ pub(crate) struct AuxFilesState {

 pub struct Timeline {
    conf: &'static PageServerConf,
-    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,

    myself: Weak<Self>,

@@ -310,8 +309,6 @@ pub struct Timeline {
    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,

-    last_image_layer_creation_check_at: AtomicLsn,
-
    /// Current logical size of the "datadir", at the last LSN.
    current_logical_size: LogicalSize,

@@ -613,25 +610,6 @@ pub enum GetVectoredImpl {
    Vectored,
 }

-pub(crate) enum WaitLsnWaiter<'a> {
-    Timeline(&'a Timeline),
-    Tenant,
-    PageService,
-}
-
-/// Argument to [`Timeline::shutdown`].
-#[derive(Debug, Clone, Copy)]
-pub(crate) enum ShutdownMode {
-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
-    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
-    ///
-    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
-    /// the call to [`Timeline::shutdown`].
-    FreezeAndFlush,
-    /// Shut down immediately, without waiting for any open layers to flush.
-    Hard,
-}
-
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -1080,8 +1058,7 @@ impl Timeline {
    pub(crate) async fn wait_lsn(
        &self,
        lsn: Lsn,
-        who_is_waiting: WaitLsnWaiter<'_>,
-        ctx: &RequestContext, /* Prepare for use by cancellation */
+        _ctx: &RequestContext, /* Prepare for use by cancellation */
    ) -> Result<(), WaitLsnError> {
        if self.cancel.is_cancelled() {
            return Err(WaitLsnError::Shutdown);
@@ -1089,28 +1066,20 @@ impl Timeline {
            return Err(WaitLsnError::BadState);
        }

-        if cfg!(debug_assertions) {
-            match ctx.task_kind() {
-                TaskKind::WalReceiverManager
-                | TaskKind::WalReceiverConnectionHandler
-                | TaskKind::WalReceiverConnectionPoller => {
-                    let is_myself = match who_is_waiting {
-                        WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
-                        WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
-                    };
-                    if is_myself {
-                        if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
-                            // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
-                            panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
-                        }
-                    } else {
-                        // if another  timeline's  is waiting for us, there's no deadlock risk because
-                        // our walreceiver task can make progress independent of theirs
-                    }
-                }
-                _ => {}
-            }
-        }
+        // This should never be called from the WAL receiver, because that could lead
+        // to a deadlock.
+        debug_assert!(
+            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
+            "wait_lsn cannot be called in WAL receiver"
+        );
+        debug_assert!(
+            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
+            "wait_lsn cannot be called in WAL receiver"
+        );
+        debug_assert!(
+            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
+            "wait_lsn cannot be called in WAL receiver"
+        );

        let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();

@@ -1173,79 +1142,6 @@ impl Timeline {
        self.flush_frozen_layers_and_wait().await
    }

-    /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
-    ///
-    /// This is for use in background housekeeping, to provide guarantees of layers closing eventually
-    /// even if there are no ongoing writes to drive that.
-    async fn maybe_freeze_ephemeral_layer(&self) {
-        let Ok(_write_guard) = self.write_lock.try_lock() else {
-            // If the write lock is held, there is an active wal receiver: rolling open layers
-            // is their responsibility while they hold this lock.
-            return;
-        };
-
-        let Ok(layers_guard) = self.layers.try_read() else {
-            // Don't block if the layer lock is busy
-            return;
-        };
-
-        let Some(open_layer) = &layers_guard.layer_map().open_layer else {
-            // No open layer, no work to do.
-            return;
-        };
-
-        let Some(current_size) = open_layer.try_len() else {
-            // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
-            // read lock to get size should always succeed.
-            tracing::warn!("Lock conflict while reading size of open layer");
-            return;
-        };
-
-        let current_lsn = self.get_last_record_lsn();
-
-        let checkpoint_distance_override = open_layer.tick().await;
-
-        if let Some(size_override) = checkpoint_distance_override {
-            if current_size > size_override {
-                // This is not harmful, but it only happens in relatively rare cases where
-                // time-based checkpoints are not happening fast enough to keep the amount of
-                // ephemeral data within configured limits.  It's a sign of stress on the system.
-                tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
-            }
-        }
-
-        let checkpoint_distance =
-            checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
-
-        if self.should_roll(
-            current_size,
-            current_size,
-            checkpoint_distance,
-            self.get_last_record_lsn(),
-            self.last_freeze_at.load(),
-            *self.last_freeze_ts.read().unwrap(),
-        ) {
-            match open_layer.info() {
-                InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
-                    // We may reach this point if the layer was already frozen by not yet flushed: flushing
-                    // happens asynchronously in the background.
-                    tracing::debug!(
-                        "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
-                    );
-                }
-                InMemoryLayerInfo::Open { .. } => {
-                    // Upgrade to a write lock and freeze the layer
-                    drop(layers_guard);
-                    let mut layers_guard = self.layers.write().await;
-                    layers_guard
-                        .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
-                        .await;
-                }
-            }
-            self.flush_frozen_layers();
-        }
-    }
-
    /// Outermost timeline compaction operation; downloads needed layers.
    pub(crate) async fn compact(
        self: &Arc<Self>,
@@ -1268,11 +1164,6 @@ impl Timeline {
            (guard, permit)
        };

-        // Prior to compaction, check if an open ephemeral layer should be closed: this provides
-        // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
-        // an ephemeral layer open forever when idle.
-        self.maybe_freeze_ephemeral_layer().await;
-
        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
@@ -1305,7 +1196,6 @@ impl Timeline {

    pub(crate) fn activate(
        self: &Arc<Self>,
-        parent: Arc<crate::tenant::Tenant>,
        broker_client: BrokerClientChannel,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
@@ -1316,122 +1206,86 @@ impl Timeline {
        }
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
-        self.launch_eviction_task(parent, background_jobs_can_start);
+        self.launch_eviction_task(background_jobs_can_start);
    }

-    /// After this function returns, there are no timeline-scoped tasks are left running.
+    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
    ///
-    /// The preferred pattern for is:
-    /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
-    /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
-    ///   go the extra mile and keep track of JoinHandles
-    /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
-    ///   instead of spawning directly on a runtime. It is a more composable / testable pattern.
-    ///
-    /// For legacy reasons, we still have multiple tasks spawned using
-    /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
-    /// We refer to these as "timeline-scoped task_mgr tasks".
-    /// Some of these tasks are already sensitive to Timeline::cancel while others are
-    /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
-    /// or [`task_mgr::shutdown_watcher`].
-    /// We want to gradually convert the code base away from these.
-    ///
-    /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
-    /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
-    /// ones that aren't mentioned here):
-    /// - [`TaskKind::TimelineDeletionWorker`]
-    ///    - NB: also used for tenant deletion
-    /// - [`TaskKind::RemoteUploadTask`]`
-    /// - [`TaskKind::InitialLogicalSizeCalculation`]
-    /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
-    // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
-    /// - [`TaskKind::Eviction`]
-    /// - [`TaskKind::LayerFlushTask`]
-    /// - [`TaskKind::OndemandLogicalSizeCalculation`]
-    /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
-    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
+    /// While we are flushing, we continue to accept read I/O.
+    pub(crate) async fn flush_and_shutdown(&self) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        let try_freeze_and_flush = match mode {
-            ShutdownMode::FreezeAndFlush => true,
-            ShutdownMode::Hard => false,
-        };
+        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
+        // trying to flush
+        tracing::debug!("Waiting for WalReceiverManager...");
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
+        )
+        .await;

-        // Regardless of whether we're going to try_freeze_and_flush
-        // or not, stop ingesting any more data. Walreceiver only provides
-        // cancellation but no "wait until gone", because it uses the Timeline::gate.
-        // So, only after the self.gate.close() below will we know for sure that
-        // no walreceiver tasks are left.
-        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
-        // data during the call to `self.freeze_and_flush()` below.
-        // That's not ideal, but, we don't have the concept of a ChildGuard,
-        // which is what we'd need to properly model early shutdown of the walreceiver
-        // task sub-tree before the other Timeline task sub-trees.
-        let walreceiver = self.walreceiver.lock().unwrap().take();
-        tracing::debug!(
-            is_some = walreceiver.is_some(),
-            "Waiting for WalReceiverManager..."
-        );
-        if let Some(walreceiver) = walreceiver {
-            walreceiver.cancel();
-        }
-        // ... and inform any waiters for newer LSNs that there won't be any.
+        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
        self.last_record_lsn.shutdown();

-        if try_freeze_and_flush {
-            // we shut down walreceiver above, so, we won't add anything more
-            // to the InMemoryLayer; freeze it and wait for all frozen layers
-            // to reach the disk & upload queue, then shut the upload queue and
-            // wait for it to drain.
-            match self.freeze_and_flush().await {
-                Ok(_) => {
-                    // drain the upload queue
-                    if let Some(client) = self.remote_client.as_ref() {
-                        // if we did not wait for completion here, it might be our shutdown process
-                        // didn't wait for remote uploads to complete at all, as new tasks can forever
-                        // be spawned.
-                        //
-                        // what is problematic is the shutting down of RemoteTimelineClient, because
-                        // obviously it does not make sense to stop while we wait for it, but what
-                        // about corner cases like s3 suddenly hanging up?
-                        client.shutdown().await;
-                    }
-                }
-                Err(e) => {
-                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                    // we have some extra WAL replay to do next time the timeline starts.
-                    warn!("failed to freeze and flush: {e:#}");
+        // Now that all writers to the InMemoryLayer are gone, do the final flush
+        match self.freeze_and_flush().await {
+            Ok(_) => {
+                // drain the upload queue
+                if let Some(client) = self.remote_client.as_ref() {
+                    // if we did not wait for completion here, it might be our shutdown process
+                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                    // be spawned.
+                    //
+                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                    // obviously it does not make sense to stop while we wait for it, but what
+                    // about corner cases like s3 suddenly hanging up?
+                    client.shutdown().await;
                }
            }
+            Err(e) => {
+                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                // we have some extra WAL replay to do next time the timeline starts.
+                warn!("failed to freeze and flush: {e:#}");
+            }
        }

+        self.shutdown().await;
+    }
+
+    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
+    /// the graceful [`Timeline::flush_and_shutdown`] function.
+    pub(crate) async fn shutdown(&self) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

-        // Transition the remote_client into a state where it's only useful for timeline deletion.
-        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
+        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
+        // while doing so.
+        self.last_record_lsn.shutdown();
+
+        // Shut down the layer flush task before the remote client, as one depends on the other
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::LayerFlushTask),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
+        )
+        .await;
+
+        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
+        // case our caller wants to use that for a deletion
        if let Some(remote_client) = self.remote_client.as_ref() {
            remote_client.stop();
-            // As documented in remote_client.stop()'s doc comment, it's our responsibility
-            // to shut down the upload queue tasks.
-            // TODO: fix that, task management should be encapsulated inside remote_client.
-            task_mgr::shutdown_tasks(
-                Some(TaskKind::RemoteUploadTask),
-                Some(self.tenant_shard_id),
-                Some(self.timeline_id),
-            )
-            .await;
        }

-        // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
        tracing::debug!("Waiting for tasks...");
+
        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;

-        // Finally wait until any gate-holders are complete.
-        //
-        // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
-        // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
+        // Finally wait until any gate-holders are complete
        self.gate.close().await;

        self.metrics.shutdown();
@@ -1580,53 +1434,6 @@ impl Timeline {
            Err(EvictionError::Timeout) => Ok(Some(false)),
        }
    }
-
-    fn should_roll(
-        &self,
-        layer_size: u64,
-        projected_layer_size: u64,
-        checkpoint_distance: u64,
-        projected_lsn: Lsn,
-        last_freeze_at: Lsn,
-        last_freeze_ts: Instant,
-    ) -> bool {
-        let distance = projected_lsn.widening_sub(last_freeze_at);
-
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
-        //    account for how writes are distributed across shards: we expect each node to consume
-        //    1/count of the LSN on average.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
-            info!(
-                "Will roll layer at {} with layer size {} due to LSN distance ({})",
-                projected_lsn, layer_size, distance
-            );
-
-            true
-        } else if projected_layer_size >= checkpoint_distance {
-            info!(
-                "Will roll layer at {} with layer size {} due to layer size ({})",
-                projected_lsn, layer_size, projected_layer_size
-            );
-
-            true
-        } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
-            info!(
-                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
-                projected_lsn,
-                layer_size,
-                last_freeze_ts.elapsed()
-            );
-
-            true
-        } else {
-            false
-        }
-    }
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -1635,65 +1442,57 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .lazy_slru_download
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }

    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
-        let tenant_conf = &self.tenant_conf.load();
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
-            .tenant_conf
            .compaction_algorithm
            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.load();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
-            .tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }
@@ -1707,26 +1506,14 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

-    fn get_image_layer_creation_check_threshold(&self) -> u8 {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .image_layer_creation_check_threshold
-            .unwrap_or(
-                self.conf
-                    .default_tenant_conf
-                    .image_layer_creation_check_threshold,
-            )
-    }
-
-    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+    pub(super) fn tenant_conf_updated(&self) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.

        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                new_conf,
+                &self.tenant_conf.read().unwrap().tenant_conf,
                &self.conf.default_tenant_conf,
            );

@@ -1753,7 +1540,7 @@ impl Timeline {
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
-        tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
+        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
@@ -1772,13 +1559,14 @@ impl Timeline {
        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

-        let evictions_low_residence_duration_metric_threshold = {
-            let loaded_tenant_conf = tenant_conf.load();
+        let tenant_conf_guard = tenant_conf.read().unwrap();
+
+        let evictions_low_residence_duration_metric_threshold =
            Self::get_evictions_low_residence_duration_metric_threshold(
-                &loaded_tenant_conf.tenant_conf,
+                &tenant_conf_guard.tenant_conf,
                &conf.default_tenant_conf,
-            )
-        };
+            );
+        drop(tenant_conf_guard);

        Arc::new_cyclic(|myself| {
            let mut result = Timeline {
@@ -1855,7 +1643,6 @@ impl Timeline {
                },
                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,
-                last_image_layer_creation_check_at: AtomicLsn::new(0),

                last_received_wal: Mutex::new(None),
                rel_size_cache: RwLock::new(HashMap::new()),
@@ -1884,7 +1671,6 @@ impl Timeline {
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
-
            result
                .metrics
                .last_record_gauge
@@ -1961,19 +1747,20 @@ impl Timeline {
            self.timeline_id, self.tenant_shard_id
        );

-        let tenant_conf = self.tenant_conf.load();
-        let wal_connect_timeout = tenant_conf
+        let tenant_conf_guard = self.tenant_conf.read().unwrap();
+        let wal_connect_timeout = tenant_conf_guard
            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf
+        let lagging_wal_timeout = tenant_conf_guard
            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf
+        let max_lsn_wal_lag = tenant_conf_guard
            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
+        drop(tenant_conf_guard);

        let mut guard = self.walreceiver.lock().unwrap();
        assert!(
@@ -2521,6 +2308,10 @@ impl Timeline {
                debug!("cancelling logical size calculation for timeline shutdown");
                calculation.await
            }
+            _ = task_mgr::shutdown_watcher() => {
+                debug!("cancelling logical size calculation for task shutdown");
+                calculation.await
+            }
        }
    }

@@ -2796,10 +2587,6 @@ impl Timeline {
                    // Get all the data needed to reconstruct the page version from this layer.
                    // But if we have an older cached page image, no need to go past that.
                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-
-                    let open_layer = open_layer.clone();
-                    drop(guard);
-
                    result = match open_layer
                        .get_value_reconstruct_data(
                            key,
@@ -2817,7 +2604,10 @@ impl Timeline {
                    traversal_path.push((
                        result,
                        cont_lsn,
-                        Box::new(move || open_layer.traversal_id()),
+                        Box::new({
+                            let open_layer = Arc::clone(open_layer);
+                            move || open_layer.traversal_id()
+                        }),
                    ));
                    continue 'outer;
                }
@@ -2827,10 +2617,6 @@ impl Timeline {
                if cont_lsn > start_lsn {
                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-
-                    let frozen_layer = frozen_layer.clone();
-                    drop(guard);
-
                    result = match frozen_layer
                        .get_value_reconstruct_data(
                            key,
@@ -2848,7 +2634,10 @@ impl Timeline {
                    traversal_path.push((
                        result,
                        cont_lsn,
-                        Box::new(move || frozen_layer.traversal_id()),
+                        Box::new({
+                            let frozen_layer = Arc::clone(frozen_layer);
+                            move || frozen_layer.traversal_id()
+                        }),
                    ));
                    continue 'outer;
                }
@@ -2856,8 +2645,6 @@ impl Timeline {

            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                let layer = guard.get_from_desc(&layer);
-                drop(guard);
-
                // Get all the data needed to reconstruct the page version from this layer.
                // But if we have an older cached page image, no need to go past that.
                let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -2975,6 +2762,16 @@ impl Timeline {

        let mut completed_keyspace = KeySpace::default();

+        // Hold the layer map whilst visiting the timeline to prevent
+        // compaction, eviction and flushes from rendering the layers unreadable.
+        //
+        // TODO: Do we actually need to do this? In theory holding on
+        // to [`tenant::storage_layer::Layer`] should be enough. However,
+        // [`Timeline::get`] also holds the lock during IO, so more investigation
+        // is needed.
+        let guard = timeline.layers.read().await;
+        let layers = guard.layer_map();
+
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -2984,9 +2781,6 @@ impl Timeline {
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);

-            let guard = timeline.layers.read().await;
-            let layers = guard.layer_map();
-
            let in_memory_layer = layers.find_in_memory_layer(|l| {
                let start_lsn = l.get_lsn_range().start;
                cont_lsn > start_lsn
@@ -2994,11 +2788,12 @@ impl Timeline {

            match in_memory_layer {
                Some(l) => {
-                    let lsn_range = l.get_lsn_range().start..cont_lsn;
                    fringe.update(
-                        ReadableLayer::InMemoryLayer(l),
+                        ReadableLayerDesc::InMemory {
+                            handle: l,
+                            lsn_ceil: cont_lsn,
+                        },
                        unmapped_keyspace.clone(),
-                        lsn_range,
                    );
                }
                None => {
@@ -3010,43 +2805,30 @@ impl Timeline {
                            .into_iter()
                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                                (
-                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
+                                    ReadableLayerDesc::Persistent {
+                                        desc: (*layer).clone(),
+                                        lsn_range: lsn_floor..cont_lsn,
+                                    },
                                    keyspace_accum.to_keyspace(),
-                                    lsn_floor..cont_lsn,
                                )
                            })
-                            .for_each(|(layer, keyspace, lsn_range)| {
-                                fringe.update(layer, keyspace, lsn_range)
-                            });
+                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
                    }
                }
            }

-            // It's safe to drop the layer map lock after planning the next round of reads.
-            // The fringe keeps readable handles for the layers which are safe to read even
-            // if layers were compacted or flushed.
-            //
-            // The more interesting consideration is: "Why is the read algorithm still correct
-            // if the layer map changes while it is operating?". Doing a vectored read on a
-            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
-            // covered by the read. The layer map tells us how to move the lsn downwards for a
-            // range at *a particular point in time*. It is fine for the answer to be different
-            // at two different time points.
-            drop(guard);
-
-            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
-                let next_cont_lsn = lsn_range.start;
+            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
                layer_to_read
                    .get_values_reconstruct_data(
+                        &guard,
                        keyspace_to_read.clone(),
-                        lsn_range,
                        reconstruct_state,
                        ctx,
                    )
                    .await?;

                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = next_cont_lsn;
+                cont_lsn = layer_to_read.get_lsn_floor();
            } else {
                break;
            }
@@ -3124,7 +2906,7 @@ impl Timeline {
            }
        }
        ancestor
-            .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
+            .wait_lsn(self.ancestor_lsn, ctx)
            .await
            .map_err(|e| match e {
                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -3204,11 +2986,16 @@ impl Timeline {
        loop {
            tokio::select! {
                _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task due to Timeline::cancel");
+                    info!("shutting down layer flush task");
+                    break;
+                },
+                _ = task_mgr::shutdown_watcher() => {
+                    info!("shutting down layer flush task");
                    break;
                },
                _ = layer_flush_start_rx.changed() => {}
            }
+
            trace!("waking up");
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
@@ -3584,24 +3371,6 @@ impl Timeline {

    // Is it time to create a new image layer for the given partition?
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
-        let last = self.last_image_layer_creation_check_at.load();
-        if lsn != Lsn(0) {
-            let distance = lsn
-                .checked_sub(last)
-                .expect("Attempt to compact with LSN going backwards");
-
-            let min_distance = self.get_image_layer_creation_check_threshold() as u64
-                * self.get_checkpoint_distance();
-
-            // Skip the expensive delta layer counting below if we've not ingested
-            // sufficient WAL since the last check.
-            if distance.0 < min_distance {
-                return false;
-            }
-        }
-
-        self.last_image_layer_creation_check_at.store(lsn);
-
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read().await;
@@ -3943,24 +3712,6 @@ impl Timeline {
        Ok(())
    }

-    /// Schedules the uploads of the given image layers
-    fn upload_new_image_layers(
-        self: &Arc<Self>,
-        new_images: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<()> {
-        let Some(remote_client) = &self.remote_client else {
-            return Ok(());
-        };
-        for layer in new_images {
-            remote_client.schedule_layer_file_upload(layer)?;
-        }
-        // should any new image layer been created, not uploading index_part will
-        // result in a mismatch between remote_physical_size and layermap calculated
-        // size, which will fail some tests, but should not be an issue otherwise.
-        remote_client.schedule_index_upload_for_file_changes()?;
-        Ok(())
-    }
-
    /// Update information about which layer files need to be retained on
    /// garbage collection. This is separate from actually performing the GC,
    /// and is updated more frequently, so that compaction can remove obsolete
@@ -4700,6 +4451,52 @@ impl<'a> TimelineWriter<'a> {
        res
    }

+    /// "Tick" the timeline writer: roll the open layer if the roll criteria are met;
+    /// otherwise, if a layer is open, tick it. Errors from the awaited calls propagate.
+    pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
+        self.open_layer_if_present().await?;
+        // NOTE(review): the 0 is presumably "no pending value size" -- confirm.
+        let last_record_lsn = self.get_last_record_lsn();
+        let action = self.get_open_layer_action(last_record_lsn, 0);
+        if action == OpenLayerAction::Roll {
+            self.roll_layer(last_record_lsn).await?;
+        } else if let Some(writer_state) = &mut *self.write_guard {
+            // Periodic update of statistics
+            writer_state.open_layer.tick().await;
+        }
+
+        Ok(())
+    }
+
+    /// Populate the timeline writer state only if an in-memory layer is already open;
+    /// a no-op otherwise. Panics if the writer state (`write_guard`) is already set.
+    async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
+        assert!(self.write_guard.is_none());
+        // `guard` is local to this block, so the layers lock is not held across the awaits below.
+        let open_layer = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            match layers.open_layer {
+                Some(ref open_layer) => open_layer.clone(),
+                None => {
+                    return Ok(());
+                }
+            }
+        };
+        // Seed the new writer state from the layer's size and the last-freeze LSN/timestamp.
+        let initial_size = open_layer.size().await?;
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+        self.write_guard.replace(TimelineWriterState::new(
+            open_layer,
+            initial_size,
+            last_freeze_at,
+            last_freeze_ts,
+        ));
+
+        Ok(())
+    }
+
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
@@ -4771,14 +4568,43 @@ impl<'a> TimelineWriter<'a> {
            return OpenLayerAction::None;
        }

-        if self.tl.should_roll(
-            state.current_size,
-            state.current_size + new_value_size,
-            self.get_checkpoint_distance(),
-            lsn,
-            state.cached_last_freeze_at,
-            state.cached_last_freeze_ts,
-        ) {
+        let distance = lsn.widening_sub(state.cached_last_freeze_at);
+        let proposed_open_layer_size = state.current_size + new_value_size;
+
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to LSN distance ({})",
+                lsn, state.current_size, distance
+            );
+
+            OpenLayerAction::Roll
+        } else if proposed_open_layer_size >= self.get_checkpoint_distance() {
+            info!(
+                "Will roll layer at {} with layer size {} due to layer size ({})",
+                lsn, state.current_size, proposed_open_layer_size
+            );
+
+            OpenLayerAction::Roll
+        } else if distance > 0
+            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
+                lsn,
+                state.current_size,
+                state.cached_last_freeze_ts.elapsed()
+            );
+
            OpenLayerAction::Roll
        } else {
            OpenLayerAction::None
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -125,8 +125,18 @@ impl Timeline {
                    )
                    .await
                    .map_err(anyhow::Error::from)?;
+                if let Some(remote_client) = &self.remote_client {
+                    for layer in layers {
+                        remote_client.schedule_layer_file_upload(layer)?;
+                    }
+                }

-                self.upload_new_image_layers(layers)?;
+                if let Some(remote_client) = &self.remote_client {
+                    // Should any new image layer have been created, not uploading index_part will
+                    // result in a mismatch between remote_physical_size and layermap calculated
+                    // size, which will fail some tests, but should not be an issue otherwise.
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -808,10 +818,7 @@ impl TimelineAdaptor {
        self.timeline
            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
            .await?;
-
-        self.timeline
-            .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
-
+        self.new_images.clear();
        self.new_deltas.clear();
        self.layers_to_delete.clear();
        Ok(())
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{error, info, instrument, Instrument};
+use tracing::{debug, error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
@@ -14,6 +14,7 @@ use crate::{
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
+        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, Tenant,
@@ -22,6 +23,58 @@ use crate::{

 use super::{Timeline, TimelineResources};

+/// Now that the Timeline is in Stopping state, cancel it and shut down all its related tasks.
+async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+    // Notify any timeline work to drop out of loops/requests
+    tracing::debug!("Cancelling CancellationToken");
+    timeline.cancel.cancel();
+    // Stop the walreceiver first. It is take()n in a separate statement so that the
+    // mutex guard is not held across the `.await` on its stop().
+    debug!("waiting for wal receiver to shutdown");
+    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+    if let Some(walreceiver) = maybe_started_walreceiver {
+        walreceiver.stop().await;
+    }
+    debug!("wal receiver shutdown confirmed");
+
+    // Shut down the layer flush task before the remote client, as one depends on the other
+    task_mgr::shutdown_tasks(
+        Some(TaskKind::LayerFlushTask),
+        Some(timeline.tenant_shard_id),
+        Some(timeline.timeline_id),
+    )
+    .await;
+
+    // Prevent new uploads from starting.
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        remote_client.stop();
+    }
+
+    // Stop & wait for the remaining timeline tasks, including upload tasks.
+    // NB: This and other delete_timeline calls do not run as a task_mgr task,
+    //     so, they are not affected by this shutdown_tasks() call.
+    info!("waiting for timeline tasks to shutdown");
+    task_mgr::shutdown_tasks(
+        None,
+        Some(timeline.tenant_shard_id),
+        Some(timeline.timeline_id),
+    )
+    .await;
+    // Failpoint (test hook): when triggered, return an error before the gate is closed.
+    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-before-index-deleted-at"
+        ))?
+    });
+    // Finally wait until any gate-holders are complete.
+    tracing::debug!("Waiting for gate...");
+    timeline.gate.close().await;
+    tracing::debug!("Shutdown complete");
+
+    Ok(())
+}
+
 /// Mark timeline as deleted in S3 so we won't pick it up next time
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
@@ -215,14 +268,7 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

-        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-        timeline.shutdown(super::ShutdownMode::Hard).await;
-
-        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-index-deleted-at"
-            ))?
-        });
+        stop_tasks(&timeline).await?;

        set_deleted_in_remote_index(&timeline).await?;

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -51,7 +51,6 @@ pub struct EvictionTaskTenantState {
 impl Timeline {
    pub(super) fn launch_eviction_task(
        self: &Arc<Self>,
-        parent: Arc<Tenant>,
        background_tasks_can_start: Option<&completion::Barrier>,
    ) {
        let self_clone = Arc::clone(self);
@@ -67,19 +66,20 @@ impl Timeline {
            ),
            false,
            async move {
+                let cancel = task_mgr::shutdown_token();
                tokio::select! {
-                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
+                    _ = cancel.cancelled() => { return Ok(()); }
                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                };

-                self_clone.eviction_task(parent).await;
+                self_clone.eviction_task(cancel).await;
                Ok(())
            },
        );
    }

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
+    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
        use crate::tenant::tasks::random_init_delay;

        // acquire the gate guard only once within a useful span
@@ -94,7 +94,7 @@ impl Timeline {
                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
-            if random_init_delay(period, &self.cancel).await.is_err() {
+            if random_init_delay(period, &cancel).await.is_err() {
                return;
            }
        }
@@ -103,13 +103,13 @@ impl Timeline {
        loop {
            let policy = self.get_eviction_policy();
            let cf = self
-                .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
+                .eviction_iteration(&policy, &cancel, &guard, &ctx)
                .await;

            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
+                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
                        .await
                        .is_ok()
                    {
@@ -123,7 +123,6 @@ impl Timeline {
    #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
    async fn eviction_iteration(
        self: &Arc<Self>,
-        tenant: &Tenant,
        policy: &EvictionPolicy,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -138,7 +137,7 @@ impl Timeline {
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
                match self
-                    .eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
+                    .eviction_iteration_threshold(p, cancel, gate, ctx)
                    .await
                {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
@@ -147,11 +146,7 @@ impl Timeline {
                (p.period, p.threshold)
            }
            EvictionPolicy::OnlyImitiate(p) => {
-                if self
-                    .imitiate_only(tenant, p, cancel, gate, ctx)
-                    .await
-                    .is_break()
-                {
+                if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
                    return ControlFlow::Break(());
                }
                (p.period, p.threshold)
@@ -180,7 +175,6 @@ impl Timeline {

    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
-        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -199,10 +193,7 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        match self
-            .imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
-        {
+        match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
            ControlFlow::Break(()) => return ControlFlow::Break(()),
            ControlFlow::Continue(()) => (),
        }
@@ -324,7 +315,6 @@ impl Timeline {
    /// disk usage based eviction task.
    async fn imitiate_only(
        self: &Arc<Self>,
-        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -341,8 +331,7 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
+        self.imitate_layer_accesses(p, cancel, gate, ctx).await
    }

    /// If we evict layers but keep cached values derived from those layers, then
@@ -372,7 +361,6 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
-        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -408,11 +396,17 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
+            Ok(t) => t,
+            Err(_) => {
+                return ControlFlow::Break(());
+            }
+        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
+                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now());
            }
@@ -486,7 +480,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
-        tenant: &Tenant,
+        tenant: &Arc<Tenant>,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) {
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -86,7 +86,6 @@ impl<'t> UninitializedTimeline<'t> {
    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
        self,
-        tenant: Arc<Tenant>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
@@ -115,7 +114,7 @@ impl<'t> UninitializedTimeline<'t> {

        // All the data has been imported. Insert the Timeline into the tenant's timelines map
        let tl = self.finish_creation()?;
-        tl.activate(tenant, broker_client, None, ctx);
+        tl.activate(broker_client, None, ctx);
        Ok(tl)
    }

--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -24,21 +24,26 @@ mod connection_manager;
 mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
+use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

+use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
+use std::ops::ControlFlow;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
+use tokio::select;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

+use utils::id::TimelineId;
+
 use self::connection_manager::ConnectionManagerStatus;

 use super::Timeline;
@@ -57,10 +62,9 @@ pub struct WalReceiverConf {
 }

 /// Handle returned by the code that spawns the "walreceiver for timeline" task; offers `stop()` and `status()`.
 pub struct WalReceiver {
+    tenant_shard_id: TenantShardId, // passed to `task_mgr::shutdown_tasks` by `stop()`
+    timeline_id: TimelineId, // passed to `task_mgr::shutdown_tasks` by `stop()`
    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>, // same Arc the manager task receives as `loop_status`; the task sets it back to `None` on exit
-    /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
-    /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
-    cancel: CancellationToken,
 }

 impl WalReceiver {
@@ -74,58 +78,65 @@ impl WalReceiver {
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
+
        let loop_status = Arc::new(std::sync::RwLock::new(None));
        let manager_status = Arc::clone(&loop_status);
-        let cancel = timeline.cancel.child_token();
-        WALRECEIVER_RUNTIME.spawn({
-            let cancel = cancel.clone();
+        task_mgr::spawn(
+            WALRECEIVER_RUNTIME.handle(),
+            TaskKind::WalReceiverManager,
+            Some(timeline.tenant_shard_id),
+            Some(timeline_id),
+            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
+            false,
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
-                // acquire timeline gate so we know the task doesn't outlive the Timeline
-                let Ok(_guard) = timeline.gate.enter() else {
-                    debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
-                    return;
-                };
                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
-                    cancel.clone(),
                );
-                while !cancel.is_cancelled() {
-                    let loop_step_result = connection_manager_loop_step(
-                        &mut broker_client,
-                        &mut connection_manager_state,
-                        &walreceiver_ctx,
-                        &cancel,
-                        &loop_status,
-                    ).await;
-                    match loop_step_result {
-                        Ok(()) => continue,
-                        Err(_cancelled) => {
-                            trace!("Connection manager loop ended, shutting down");
+                loop {
+                    select! {
+                        _ = task_mgr::shutdown_watcher() => {
+                            trace!("WAL receiver shutdown requested, shutting down");
                            break;
-                        }
+                        },
+                        loop_step_result = connection_manager_loop_step(
+                            &mut broker_client,
+                            &mut connection_manager_state,
+                            &walreceiver_ctx,
+                            &loop_status,
+                        ) => match loop_step_result {
+                            ControlFlow::Continue(()) => continue,
+                            ControlFlow::Break(()) => {
+                                trace!("Connection manager loop ended, shutting down");
+                                break;
+                            }
+                        },
                    }
                }
+
                connection_manager_state.shutdown().await;
                *loop_status.write().unwrap() = None;
-                debug!("task exits");
+                Ok(())
            }
            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
-        });
+        );

        Self {
+            tenant_shard_id,
+            timeline_id,
            manager_status,
-            cancel,
        }
    }

-    #[instrument(skip_all, level = tracing::Level::DEBUG)]
-    pub fn cancel(&self) {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        debug!("cancelling walreceiver tasks");
-        self.cancel.cancel();
+    pub async fn stop(self) { // takes `self` by value, so the receiver cannot be used after stopping
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager), // the kind the manager task is spawned under
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
+        )
+        .await; // NOTE(review): presumably resolves once the matching tasks have exited — confirm in task_mgr
    }

    pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -159,18 +170,14 @@ enum TaskStateUpdate<E> {

 impl<E: Clone> TaskHandle<E> {
    /// Initializes the task, starting it immediately after the creation.
-    ///
-    /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
-    /// It being a child token enables us to provide a [`Self::shutdown`] method.
    fn spawn<Fut>(
-        cancel_parent: &CancellationToken,
        task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
    ) -> Self
    where
        Fut: Future<Output = anyhow::Result<()>> + Send,
        E: Send + Sync + 'static,
    {
-        let cancellation = cancel_parent.child_token();
+        let cancellation = CancellationToken::new();
        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);

        let cancellation_clone = cancellation.clone();
@@ -190,9 +197,6 @@ impl<E: Clone> TaskHandle<E> {
        }
    }

-    /// # Cancel-Safety
-    ///
-    /// Cancellation-safe.
    async fn next_task_event(&mut self) -> TaskEvent<E> {
        match self.events_receiver.changed().await {
            Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::TaskKind;
+use crate::task_mgr::{shutdown_token, TaskKind};
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::{BrokerClientChannel, Code, Streaming};
-use tokio_util::sync::CancellationToken;
+use tokio::select;
 use tracing::*;

 use postgres_connection::PgConnectionConfig;
@@ -45,33 +45,27 @@ use super::{
    TaskEvent, TaskHandle,
 };

-pub(crate) struct Cancelled;
-
 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
 /// If storage broker subscription is cancelled, exits.
-///
-/// # Cancel-Safety
-///
-/// Not cancellation-safe. Use `cancel` token to request cancellation.
 pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
-    cancel: &CancellationToken,
    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
-) -> Result<(), Cancelled> {
-    match tokio::select! {
-        _ = cancel.cancelled() => { return Err(Cancelled); },
-        st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
-    } {
+) -> ControlFlow<(), ()> {
+    match connection_manager_state
+        .timeline
+        .wait_to_become_active(ctx)
+        .await
+    {
        Ok(()) => {}
        Err(new_state) => {
            debug!(
                ?new_state,
                "state changed, stopping wal connection manager loop"
            );
-            return Err(Cancelled);
+            return ControlFlow::Break(());
        }
    }

@@ -92,7 +86,7 @@ pub(super) async fn connection_manager_loop_step(
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
-    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
    debug!("Subscribed for broker timeline updates");

    loop {
@@ -100,7 +94,6 @@ pub(super) async fn connection_manager_loop_step(

        // These things are happening concurrently:
        //
-        // - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -108,11 +101,7 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
-
-        // NB: make sure each of the select expressions are cancellation-safe
-        // (no need for arms to be cancellation-safe).
-        tokio::select! {
-            _ = cancel.cancelled() => { return Err(Cancelled); }
+        select! {
            Some(wal_connection_update) = async {
                match connection_manager_state.wal_connection.as_mut() {
                    Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -144,7 +133,7 @@ pub(super) async fn connection_manager_loop_step(
            },

            // Got a new update from the broker
-            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
+            broker_update = broker_subscription.message() => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                    Err(status) => {
@@ -158,17 +147,16 @@ pub(super) async fn connection_manager_loop_step(
                                warn!("broker subscription failed: {status}");
                            }
                        }
-                        return Ok(());
+                        return ControlFlow::Continue(());
                    }
                    Ok(None) => {
                        error!("broker subscription stream ended"); // can't happen
-                        return Ok(());
+                        return ControlFlow::Continue(());
                    }
                }
            },

            new_event = async {
-                // Reminder: this match arm needs to be cancellation-safe.
                loop {
                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                        warn!("wal connection manager should only be launched after timeline has become active");
@@ -194,11 +182,11 @@ pub(super) async fn connection_manager_loop_step(
                }
            } => match new_event {
                ControlFlow::Continue(()) => {
-                    return Ok(());
+                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
                    debug!("Timeline is no longer active, stopping wal connection manager loop");
-                    return Err(Cancelled);
+                    return ControlFlow::Break(());
                }
            },

@@ -230,15 +218,16 @@ pub(super) async fn connection_manager_loop_step(
 async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
-    cancel: &CancellationToken,
-) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
+) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
+    let cancel = shutdown_token();
+
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            cancel,
+            &cancel,
        )
        .await;
        attempt += 1;
@@ -252,14 +241,9 @@ async fn subscribe_for_timeline_updates(
            subscription_key: Some(key),
        };

-        match {
-            tokio::select! {
-                r = broker_client.subscribe_safekeeper_info(request) => { r }
-                _ = cancel.cancelled() => { return Err(Cancelled); }
-            }
-        } {
+        match broker_client.subscribe_safekeeper_info(request).await {
            Ok(resp) => {
-                return Ok(resp.into_inner());
+                return resp.into_inner();
            }
            Err(e) => {
                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
@@ -280,8 +264,6 @@ pub(super) struct ConnectionManagerState {
    id: TenantTimelineId,
    /// Use pageserver data about the timeline to filter out some of the safekeepers.
    timeline: Arc<Timeline>,
-    /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
-    cancel: CancellationToken,
    conf: WalReceiverConf,
    /// Current connection to safekeeper for WAL streaming.
    wal_connection: Option<WalConnection>,
@@ -404,11 +386,7 @@ struct BrokerSkTimeline {
 }

 impl ConnectionManagerState {
-    pub(super) fn new(
-        timeline: Arc<Timeline>,
-        conf: WalReceiverConf,
-        cancel: CancellationToken,
-    ) -> Self {
+    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
        let id = TenantTimelineId {
            tenant_id: timeline.tenant_shard_id.tenant_id,
            timeline_id: timeline.timeline_id,
@@ -416,7 +394,6 @@ impl ConnectionManagerState {
        Self {
            id,
            timeline,
-            cancel,
            conf,
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
@@ -424,22 +401,6 @@ impl ConnectionManagerState {
        }
    }

-    fn spawn<Fut>(
-        &self,
-        task: impl FnOnce(
-                tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
-                CancellationToken,
-            ) -> Fut
-            + Send
-            + 'static,
-    ) -> TaskHandle<WalConnectionStatus>
-    where
-        Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
-    {
-        // TODO: get rid of TaskHandle
-        super::TaskHandle::spawn(&self.cancel, task)
-    }
-
    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
        WALRECEIVER_SWITCHES
@@ -458,7 +419,7 @@ impl ConnectionManagerState {
        );

        let span = info_span!("connection", %node_id);
-        let connection_handle = self.spawn(move |events_sender, cancellation| {
+        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();

@@ -486,12 +447,6 @@ impl ConnectionManagerState {
                                info!("walreceiver connection handling ended: {e}");
                                Ok(())
                            }
-                            WalReceiverError::ClosedGate => {
-                                info!(
-                                    "walreceiver connection handling ended because of closed gate"
-                                );
-                                Ok(())
-                            }
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
@@ -531,10 +486,6 @@ impl ConnectionManagerState {

    /// Drops the current connection (if any) and updates retry timeout for the next
    /// connection attempt to the same safekeeper.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// Not cancellation-safe.
    async fn drop_old_connection(&mut self, needs_shutdown: bool) {
        let wal_connection = match self.wal_connection.take() {
            Some(wal_connection) => wal_connection,
@@ -542,14 +493,7 @@ impl ConnectionManagerState {
        };

        if needs_shutdown {
-            wal_connection
-                .connection_task
-                .shutdown()
-                // This here is why this function isn't cancellation-safe.
-                // If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
-                // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
-                // and thus be ineffective.
-                .await;
+            wal_connection.connection_task.shutdown().await;
        }

        let retry = self
@@ -894,9 +838,6 @@ impl ConnectionManagerState {
        }
    }

-    /// # Cancel-Safety
-    ///
-    /// Not cancellation-safe.
    pub(super) async fn shutdown(mut self) {
        if let Some(wal_connection) = self.wal_connection.take() {
            wal_connection.connection_task.shutdown().await;
@@ -1045,7 +986,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: state.spawn(move |sender, _| async move {
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1213,7 +1154,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: state.spawn(move |sender, _| async move {
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1280,7 +1221,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: state.spawn(move |sender, _| async move {
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1344,7 +1285,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: state.spawn(move |_, _| async move { Ok(()) }),
+            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
            discovered_new_wal: Some(NewCommittedWAL {
                discovered_at: time_over_threshold,
                lsn: new_lsn,
@@ -1400,7 +1341,6 @@ mod tests {
                timeline_id: TIMELINE_ID,
            },
            timeline,
-            cancel: CancellationToken::new(),
            conf: WalReceiverConf {
                wal_connect_timeout: Duration::from_secs(1),
                lagging_wal_timeout: Duration::from_secs(1),
@@ -1444,7 +1384,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: state.spawn(move |sender, _| async move {
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,6 +27,7 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -36,8 +37,8 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
+use utils::pageserver_feedback::PageserverFeedback;
 use utils::{id::NodeId, lsn::Lsn};
-use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -67,7 +68,6 @@ pub(super) enum WalReceiverError {
    SuccessfulCompletion(String),
    /// Generic error
    Other(anyhow::Error),
-    ClosedGate,
 }

 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,16 +119,6 @@ pub(super) async fn handle_walreceiver_connection(
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

-    // prevent timeline shutdown from finishing until we have exited
-    let _guard = timeline.gate.enter().map_err(|e| match e {
-        GateError::GateClosed => WalReceiverError::ClosedGate,
-    })?;
-    // This function spawns a side-car task (WalReceiverConnectionPoller).
-    // Get its gate guard now as well.
-    let poller_guard = timeline.gate.enter().map_err(|e| match e {
-        GateError::GateClosed => WalReceiverError::ClosedGate,
-    })?;
-
    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
@@ -166,19 +156,22 @@ pub(super) async fn handle_walreceiver_connection(
    }

    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own. It shouldn't outlive this function, but,
-    // due to lack of async drop, we can't enforce that. However, we ensure that
-    // 1. it is sensitive to `cancellation` and
-    // 2. holds the Timeline gate open so that after timeline shutdown,
-    //    we know this task is gone.
+    // so spawn it off to run on its own.
    let _connection_ctx = ctx.detached_child(
        TaskKind::WalReceiverConnectionPoller,
        ctx.download_behavior(),
    );
    let connection_cancellation = cancellation.clone();
-    WALRECEIVER_RUNTIME.spawn(
+    task_mgr::spawn(
+        WALRECEIVER_RUNTIME.handle(),
+        TaskKind::WalReceiverConnectionPoller,
+        Some(timeline.tenant_shard_id),
+        Some(timeline.timeline_id),
+        "walreceiver connection",
+        false,
        async move {
            debug_assert_current_span_has_tenant_and_timeline_id();
+
            select! {
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
@@ -189,9 +182,6 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
-                            WalReceiverError::ClosedGate => {
-                                // doesn't happen at runtime
-                            }
                            WalReceiverError::Other(err) => {
                                warn!("Connection aborted: {err:#}")
                            }
@@ -200,7 +190,7 @@ pub(super) async fn handle_walreceiver_connection(
                },
                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
            }
-            drop(poller_guard);
+            Ok(())
        }
        // Enrich the log lines emitted by this closure with meaningful context.
        // TODO: technically, this task outlives the surrounding function, so, the
@@ -313,7 +303,6 @@ pub(super) async fn handle_walreceiver_connection(

                trace!("received XLogData between {startlsn} and {endlsn}");

-                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                waldecoder.feed_bytes(data);

                {
@@ -400,6 +389,17 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

+        {
+            // This is a hack. It piggybacks on the keepalive messages sent by the
+            // safekeeper in order to enforce `checkpoint_timeout` on the currently
+            // open layer. This hack doesn't provide a bound on the total size of
+            // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
+            let mut writer = timeline.writer().await;
+            if let Err(err) = writer.tick().await {
+                warn!("Timeline writer tick failed: {err}");
+            }
+        }
+
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn = timeline
                .get_remote_consistent_lsn_visible()
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -61,7 +61,7 @@ pub struct VectoredRead {
 }

 impl VectoredRead {
-    pub fn size(&self) -> usize {
+    fn size(&self) -> usize { // private after this change (previously `pub`)
        (self.end - self.start) as usize // distance from `start` to `end`; presumably byte offsets — TODO confirm against the struct definition
    }
 }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,7 +111,6 @@ static PageServer page_servers[MAX_SHARDS];

 static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
-static void pageserver_disconnect_shard(shardno_t shard_no);

 static bool
 PagestoreShmemIsValid(void)
@@ -488,32 +487,9 @@ retry:
 	return ret;
 }

-/*
- * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch.
- */
+
 static void
 pageserver_disconnect(shardno_t shard_no)
-{
-	/*
-	 * If the connection to any pageserver is lost, we throw away the
-	 * whole prefetch queue, even for other pageservers. It should not
-	 * cause big problems, because connection loss is supposed to be a
-	 * rare event.
-	 *
-	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
-	 * because prefetch request may be registered before connection is established.
-	 */
-	prefetch_on_ps_disconnect();
-
-	pageserver_disconnect_shard(shard_no);
-}
-
-/*
- * Disconnect from specified shard
- */
-static void
-pageserver_disconnect_shard(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -527,6 +503,14 @@ pageserver_disconnect_shard(shardno_t shard_no)
 		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
 		PQfinish(page_servers[shard_no].conn);
 		page_servers[shard_no].conn = NULL;
+
+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
+		prefetch_on_ps_disconnect();
 	}
 	if (page_servers[shard_no].wes != NULL)
 	{
@@ -692,8 +676,7 @@ page_server_api api =
 {
 	.send = pageserver_send,
 	.flush = pageserver_flush,
-	.receive = pageserver_receive,
-	.disconnect = pageserver_disconnect_shard
+	.receive = pageserver_receive
 };

 static bool
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -180,7 +180,6 @@ typedef struct
 	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
 	NeonResponse *(*receive) (shardno_t shard_no);
 	bool		(*flush) (shardno_t shard_no);
-	void        (*disconnect) (shardno_t shard_no);
 } page_server_api;

 extern void prefetch_on_ps_disconnect(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -613,14 +613,6 @@ prefetch_on_ps_disconnect(void)
 		Assert(slot->status == PRFS_REQUESTED);
 		Assert(slot->my_ring_index == ring_index);

-		/*
-		 * Drop connection to all shards which have prefetch requests.
-		 * It is not a problem to call disconnect multiple times on the same connection
-		 * because disconnect implementation in libpagestore.c will check if connection
-		 * is alive and do nothing of connection was already dropped.
-		 */
-		page_server->disconnect(slot->shard_no);
-
 		/* clean up the request */
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
@@ -641,12 +633,13 @@ prefetch_on_ps_disconnect(void)
 static inline void
 prefetch_set_unused(uint64 ring_index)
 {
-	PrefetchRequest *slot;
+	PrefetchRequest *slot = GetPrfSlot(ring_index);

 	if (ring_index < MyPState->ring_last)
 		return;					/* Should already be unused */

-	slot = GetPrfSlot(ring_index);
+	Assert(MyPState->ring_unused > ring_index);
+
 	if (slot->status == PRFS_UNUSED)
 		return;

@@ -805,8 +798,7 @@ Retry:
 			{
 				if (*force_lsn > slot->effective_request_lsn)
 				{
-					if (!prefetch_wait_for(ring_index))
-						goto Retry;
+					prefetch_wait_for(ring_index);
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -821,8 +813,7 @@ Retry:
 			{
 				if (*force_lsn != slot->effective_request_lsn)
 				{
-					if (!prefetch_wait_for(ring_index))
-						goto Retry;
+					prefetch_wait_for(ring_index);
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -888,8 +879,7 @@ Retry:
 			{
 				case PRFS_REQUESTED:
 					Assert(MyPState->ring_receive == cleanup_index);
-					if (!prefetch_wait_for(cleanup_index))
-						goto Retry;
+					prefetch_wait_for(cleanup_index);
 					prefetch_set_unused(cleanup_index);
 					break;
 				case PRFS_RECEIVED:
@@ -1690,7 +1680,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2142,7 +2132,6 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 	if (entry != NULL)
@@ -2164,8 +2153,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
-				if (!prefetch_wait_for(slot->my_ring_index))
-					goto Retry;
+				prefetch_wait_for(slot->my_ring_index);
 			}
 			/* drop caches */
 			prefetch_set_unused(slot->my_ring_index);
@@ -2228,7 +2216,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2501,7 +2489,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

@@ -2556,7 +2544,7 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2861,7 +2849,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
 	pfree(resp);

--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -10,7 +10,6 @@ testing = []

 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-trait.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -12,8 +12,6 @@ use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
 use crate::console::{AuthSecret, NodeInfo};
 use crate::context::RequestMonitoring;
-use crate::intern::EndpointIdInt;
-use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
 use crate::stream::Stream;
@@ -30,7 +28,7 @@ use crate::{
 use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::info;

 /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
 pub enum MaybeOwned<'a, T> {
@@ -176,52 +174,6 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    }
 }

-impl AuthenticationConfig {
-    pub fn check_rate_limit(
-        &self,
-
-        ctx: &mut RequestMonitoring,
-        secret: AuthSecret,
-        endpoint: &EndpointId,
-        is_cleartext: bool,
-    ) -> auth::Result<AuthSecret> {
-        // we have validated the endpoint exists, so let's intern it.
-        let endpoint_int = EndpointIdInt::from(endpoint);
-
-        // only count the full hash count if password hack or websocket flow.
-        // in other words, if proxy needs to run the hashing
-        let password_weight = if is_cleartext {
-            match &secret {
-                #[cfg(any(test, feature = "testing"))]
-                AuthSecret::Md5(_) => 1,
-                AuthSecret::Scram(s) => s.iterations + 1,
-            }
-        } else {
-            // validating scram takes just 1 hmac_sha_256 operation.
-            1
-        };
-
-        let limit_not_exceeded = self
-            .rate_limiter
-            .check((endpoint_int, ctx.peer_addr), password_weight);
-
-        if !limit_not_exceeded {
-            warn!(
-                enabled = self.rate_limiter_enabled,
-                "rate limiting authentication"
-            );
-            AUTH_RATE_LIMIT_HITS.inc();
-            ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
-
-            if self.rate_limiter_enabled {
-                return Err(auth::AuthError::too_many_connections());
-            }
-        }
-
-        Ok(secret)
-    }
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 ///
@@ -262,24 +214,14 @@ async fn auth_quirks(
        Some(secret) => secret,
        None => api.get_role_secret(ctx, &info).await?,
    };
-    let (cached_entry, secret) = cached_secret.take_value();
-
-    let secret = match secret {
-        Some(secret) => config.check_rate_limit(
-            ctx,
-            secret,
-            &info.endpoint,
-            unauthenticated_password.is_some() || allow_cleartext,
-        )?,
-        None => {
-            // If we don't have an authentication secret, we mock one to
-            // prevent malicious probing (possible due to missing protocol steps).
-            // This mocked secret will never lead to successful authentication.
-            info!("authentication info not found, mocking it");
-            AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
-        }
-    };

+    let secret = cached_secret.value.clone().unwrap_or_else(|| {
+        // If we don't have an authentication secret, we mock one to
+        // prevent malicious probing (possible due to missing protocol steps).
+        // This mocked secret will never lead to successful authentication.
+        info!("authentication info not found, mocking it");
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
+    });
    match authenticate_with_secret(
        ctx,
        secret,
@@ -295,7 +237,7 @@ async fn auth_quirks(
        Err(e) => {
            if e.is_auth_failed() {
                // The password could have been changed, so we invalidate the cache.
-                cached_entry.invalidate();
+                cached_secret.invalidate();
            }
            Err(e)
        }
@@ -473,7 +415,6 @@ mod tests {

    use bytes::BytesMut;
    use fallible_iterator::FallibleIterator;
-    use once_cell::sync::Lazy;
    use postgres_protocol::{
        authentication::sasl::{ChannelBinding, ScramSha256},
        message::{backend::Message as PgMessage, frontend},
@@ -491,7 +432,6 @@ mod tests {
        },
        context::RequestMonitoring,
        proxy::NeonOptions,
-        rate_limiter::{AuthRateLimiter, RateBucketInfo},
        scram::ServerSecret,
        stream::{PqStream, Stream},
    };
@@ -533,11 +473,9 @@ mod tests {
        }
    }

-    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+    static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
        scram_protocol_timeout: std::time::Duration::from_secs(5),
-        rate_limiter_enabled: true,
-        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
-    });
+    };

    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
        loop {
@@ -606,7 +544,7 @@ mod tests {
            }
        });

-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
            .await
            .unwrap();

@@ -646,7 +584,7 @@ mod tests {
            client.write_all(&write).await.unwrap();
        });

-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
            .await
            .unwrap();

@@ -686,7 +624,7 @@ mod tests {
            client.write_all(&write).await.unwrap();
        });

-        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
+        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
            .await
            .unwrap();

--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -10,7 +10,6 @@ use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
-use proxy::config::remote_storage_from_toml;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -19,7 +18,6 @@ use proxy::console;
 use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
 use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
-use proxy::rate_limiter::AuthRateLimiter;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
@@ -143,16 +141,10 @@ struct ProxyCliArgs {
    ///
    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
    /// Can be given multiple times for different bucket sizes.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
    endpoint_rps_limit: Vec<RateBucketInfo>,
-    /// Whether the auth rate limiter actually takes effect (for testing)
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    auth_rate_limit_enabled: bool,
-    /// Authentication rate limiter max number of hashes per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
-    auth_rate_limit: Vec<RateBucketInfo>,
    /// Redis rate limiter max number of requests per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
    #[clap(long, default_value_t = 100)]
@@ -192,19 +184,6 @@ struct ProxyCliArgs {

    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,
-
-    /// interval for backup metric collection
-    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
-    metric_backup_collection_interval: std::time::Duration,
-    /// remote storage configuration for backup metric collection
-    /// Encoded as toml (same format as pageservers), eg
-    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
-    #[clap(long, default_value = "{}")]
-    metric_backup_collection_remote_storage: String,
-    /// chunk size for backup metric collection
-    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
-    #[clap(long, default_value = "4194304")]
-    metric_backup_collection_chunk_size: usize,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -386,17 +365,12 @@ async fn main() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
-        // TODO: Add gc regardles of the metric collection being enabled.
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
-        client_tasks.spawn(usage_metrics::task_backup(
-            &metrics_config.backup_metric_collection_config,
-            cancellation_token,
-        ));
    }

    if let auth::BackendType::Console(api, _) = &config.auth_backend {
@@ -453,13 +427,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    if args.allow_self_signed_compute {
        warn!("allowing self-signed compute certificates");
    }
-    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
-        interval: args.metric_backup_collection_interval,
-        remote_storage_config: remote_storage_from_toml(
-            &args.metric_backup_collection_remote_storage,
-        )?,
-        chunk_size: args.metric_backup_collection_chunk_size,
-    };

    let metric_collection = match (
        &args.metric_collection_endpoint,
@@ -468,7 +435,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
            interval: humantime::parse_duration(interval)?,
-            backup_metric_collection_config,
        }),
        (None, None) => None,
        _ => bail!(
@@ -544,8 +510,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    };
    let authentication_config = AuthenticationConfig {
        scram_protocol_timeout: args.scram_protocol_timeout,
-        rate_limiter_enabled: args.auth_rate_limit_enabled,
-        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
    };

    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -43,16 +43,6 @@ impl<C: Cache, V> Cached<C, V> {
        Self { token: None, value }
    }

-    pub fn take_value(self) -> (Cached<C, ()>, V) {
-        (
-            Cached {
-                token: self.token,
-                value: (),
-            },
-            self.value,
-        )
-    }
-
    /// Drop this entry from a cache if it's still there.
    pub fn invalidate(self) -> V {
        if let Some((cache, info)) = &self.token {
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -373,7 +373,10 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
        let secret2 = None;
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
@@ -392,7 +395,10 @@ mod tests {

        // Shouldn't add more than 2 roles.
        let user3: RoleName = "user3".into();
-        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
+        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user3.as_str(),
+            [3; 32],
+        )));
        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());

@@ -425,8 +431,14 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
-        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
@@ -474,8 +486,14 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
-        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,11 +1,6 @@
-use crate::{
-    auth,
-    rate_limiter::{AuthRateLimiter, RateBucketInfo},
-    serverless::GlobalConnPoolOptions,
-};
+use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
 use anyhow::{bail, ensure, Context, Ok};
 use itertools::Itertools;
-use remote_storage::RemoteStorageConfig;
 use rustls::{
    crypto::ring::sign,
    pki_types::{CertificateDer, PrivateKeyDer},
@@ -40,7 +35,6 @@ pub struct ProxyConfig {
 pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
    pub interval: Duration,
-    pub backup_metric_collection_config: MetricBackupCollectionConfig,
 }

 pub struct TlsConfig {
@@ -56,8 +50,6 @@ pub struct HttpConfig {

 pub struct AuthenticationConfig {
    pub scram_protocol_timeout: tokio::time::Duration,
-    pub rate_limiter_enabled: bool,
-    pub rate_limiter: AuthRateLimiter,
 }

 impl TlsConfig {
@@ -313,21 +305,6 @@ impl CertResolver {
    }
 }

-#[derive(Debug)]
-pub struct MetricBackupCollectionConfig {
-    pub interval: Duration,
-    pub remote_storage_config: OptRemoteStorageConfig,
-    pub chunk_size: usize,
-}
-
-/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
-/// runtime type errors from the value parser we use.
-pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
-
-pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
-    RemoteStorageConfig::from_toml(&s.parse()?)
-}
-
 /// Helper for cmdline cache options parsing.
 #[derive(Debug)]
 pub struct CacheOptions {
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,14 +13,12 @@ use parquet::{
    },
    record::RecordWriter,
 };
-use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
+use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;

-use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
-
 use super::{RequestMonitoring, LOG_CHAN};

 #[derive(clap::Args, Clone, Debug)]
@@ -52,13 +50,21 @@ pub struct ParquetUploadArgs {
    parquet_upload_compression: Compression,
 }

+/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
+/// runtime type errors from the value parser we use.
+type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
+
+fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
+    RemoteStorageConfig::from_toml(&s.parse()?)
+}
+
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a upload fails, we log it at info-level, and retry.
 // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_UPLOAD_RETRIES times, we give up
-pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
-pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;

 // the parquet crate leaves a lot to be desired...
 // what follows is an attempt to write parquet files with minimal allocs.
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,10 +4,7 @@ use ::metrics::{
    register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
    IntCounterVec, IntGauge, IntGaugeVec,
 };
-use metrics::{
-    register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
-    IntCounterPair,
-};
+use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};

 use once_cell::sync::Lazy;
 use tokio::time::{self, Instant};
@@ -117,15 +114,12 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
    .unwrap()
 });

-pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
         "proxy_http_conn_content_length_bytes",
-        "Number of bytes the HTTP response content consumes",
-        // request/response
-        &["direction"],
-        // smallest bucket = 16 bytes
-        // largest bucket = 4^12 * 16 bytes = 256MB
-        exponential_buckets(16.0, 4.0, 12).unwrap()
+        "Number of bytes the HTTP response content consumes",
+        // 20 buckets: smallest = 8 bytes, largest = 8 * 2^19 bytes = 4MiB
+        exponential_buckets(8.0, 2.0, 20).unwrap()
    )
    .unwrap()
 });
@@ -364,20 +358,3 @@ pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
    )
    .unwrap()
 });
-
-pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
-    register_hll!(
-        32,
-        "proxy_endpoints_auth_rate_limits",
-        "Number of endpoints affected by authentication rate limits",
-    )
-    .unwrap()
-});
-
-pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "proxy_requests_auth_rate_limits_total",
-        "Number of connection requests affected by authentication rate limits",
-    )
-    .unwrap()
-});
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -280,7 +280,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(

    // check rate limit
    if let Some(ep) = user_info.get_endpoint() {
-        if !endpoint_rate_limiter.check(ep, 1) {
+        if !endpoint_rate_limiter.check(ep) {
            return stream
                .throw_error(auth::AuthError::too_many_connections())
                .await?;
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -4,7 +4,7 @@ use crate::{
    console::messages::MetricsAuxInfo,
    metrics::NUM_BYTES_PROXIED_COUNTER,
    stream::Stream,
-    usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
+    usage_metrics::{Ids, USAGE_METRICS},
 };
 use metrics::IntCounterPairGuard;
 use tokio::io::{AsyncRead, AsyncWrite};
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -142,8 +142,8 @@ impl Scram {
        Ok(Scram(secret))
    }

-    fn mock() -> Self {
-        Scram(scram::ServerSecret::mock(rand::random()))
+    fn mock(user: &str) -> Self {
+        Scram(scram::ServerSecret::mock(user, rand::random()))
    }
 }

@@ -330,7 +330,11 @@ async fn scram_auth_mock() -> anyhow::Result<()> {

    let (client_config, server_config) =
        generate_tls_config("generic-project-name.localhost", "localhost")?;
-    let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock()));
+    let proxy = tokio::spawn(dummy_proxy(
+        client,
+        Some(server_config),
+        Scram::mock("user"),
+    ));

    use rand::{distributions::Alphanumeric, Rng};
    let password: String = rand::thread_rng()
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -4,4 +4,4 @@ mod limiter;
 pub use aimd::Aimd;
 pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
 pub use limiter::Limiter;
-pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
+pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -1,8 +1,6 @@
 use std::{
-    borrow::Cow,
    collections::hash_map::RandomState,
-    hash::{BuildHasher, Hash},
-    net::IpAddr,
+    hash::BuildHasher,
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc, Mutex,
@@ -17,7 +15,7 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
 use tracing::info;

-use crate::{intern::EndpointIdInt, EndpointId};
+use crate::EndpointId;

 use super::{
    limit_algorithm::{LimitAlgorithm, Sample},
@@ -51,11 +49,11 @@ impl RedisRateLimiter {
            .data
            .iter_mut()
            .zip(self.info)
-            .all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
+            .all(|(bucket, info)| bucket.should_allow_request(info, now));

        if should_allow_request {
            // only increment the bucket counts if the request will actually be accepted
-            self.data.iter_mut().for_each(|b| b.inc(1));
+            self.data.iter_mut().for_each(RateBucket::inc);
        }

        should_allow_request
@@ -73,14 +71,9 @@ impl RedisRateLimiter {
 // saw SNI, before doing TLS handshake. User-side error messages in that case
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
-pub type EndpointRateLimiter = BucketRateLimiter<EndpointId, StdRng, RandomState>;
-
-// This can't be just per IP because that would limit some PaaS that share IP addresses
-pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>;
-
-pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<Key, Vec<RateBucket>, Hasher>,
-    info: Cow<'static, [RateBucketInfo]>,
+pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
+    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
+    info: &'static [RateBucketInfo],
    access_count: AtomicUsize,
    rand: Mutex<Rand>,
 }
@@ -92,9 +85,9 @@ struct RateBucket {
 }

 impl RateBucket {
-    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool {
+    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
        if now - self.start < info.interval {
-            self.count + n <= info.max_rpi
+            self.count < info.max_rpi
        } else {
            // bucket expired, reset
            self.count = 0;
@@ -104,8 +97,8 @@ impl RateBucket {
        }
    }

-    fn inc(&mut self, n: u32) {
-        self.count += n;
+    fn inc(&mut self) {
+        self.count += 1;
    }
 }

@@ -118,7 +111,7 @@ pub struct RateBucketInfo {

 impl std::fmt::Display for RateBucketInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64;
+        let rps = u128::from(self.max_rpi) * 1000 / self.interval.as_millis(); // no u32 overflow
        write!(f, "{rps}@{}", humantime::format_duration(self.interval))
    }
 }
@@ -143,25 +136,12 @@ impl std::str::FromStr for RateBucketInfo {
 }

 impl RateBucketInfo {
-    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+    pub const DEFAULT_SET: [Self; 3] = [
        Self::new(300, Duration::from_secs(1)),
        Self::new(200, Duration::from_secs(60)),
        Self::new(100, Duration::from_secs(600)),
    ];

-    /// All of these are per endpoint-ip pair.
-    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
-    ///
-    /// First bucket: 300mcpus total per endpoint-ip pair
-    /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first)
-    /// * 300 requests per second with 4096 hash rounds.
-    /// * 2 requests per second with 600000 hash rounds.
-    pub const DEFAULT_AUTH_SET: [Self; 3] = [
-        Self::new(300 * 4096, Duration::from_secs(1)),
-        Self::new(200 * 4096, Duration::from_secs(60)),
-        Self::new(100 * 4096, Duration::from_secs(600)),
-    ];
-
    pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
        info.sort_unstable_by_key(|info| info.interval);
        let invalid = info
@@ -170,7 +150,7 @@ impl RateBucketInfo {
            .find(|(a, b)| a.max_rpi > b.max_rpi);
        if let Some((a, b)) = invalid {
            bail!(
-                "invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
+                "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
                b.max_rpi,
                a.max_rpi,
            );
@@ -182,24 +162,19 @@ impl RateBucketInfo {
    pub const fn new(max_rps: u32, interval: Duration) -> Self {
        Self {
            interval,
-            max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32,
+            max_rpi: (max_rps as u64 * interval.as_millis() as u64 / 1000) as u32, // multiply in u64; u32 overflows
        }
    }
 }

-impl<K: Hash + Eq> BucketRateLimiter<K> {
-    pub fn new(info: impl Into<Cow<'static, [RateBucketInfo]>>) -> Self {
+impl EndpointRateLimiter {
+    pub fn new(info: &'static [RateBucketInfo]) -> Self {
        Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
    }
 }

-impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
-    fn new_with_rand_and_hasher(
-        info: impl Into<Cow<'static, [RateBucketInfo]>>,
-        rand: R,
-        hasher: S,
-    ) -> Self {
-        let info = info.into();
+impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
+    fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
        info!(buckets = ?info, "endpoint rate limiter");
        Self {
            info,
@@ -210,7 +185,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
    }

    /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, key: K, n: u32) -> bool {
+    pub fn check(&self, endpoint: EndpointId) -> bool {
        // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
        // worst case memory usage is about:
        //    = 2 * 2048 * 64 * (48B + 72B)
@@ -220,7 +195,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
        }

        let now = Instant::now();
-        let mut entry = self.map.entry(key).or_insert_with(|| {
+        let mut entry = self.map.entry(endpoint).or_insert_with(|| {
            vec![
                RateBucket {
                    start: now,
@@ -232,12 +207,12 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {

        let should_allow_request = entry
            .iter_mut()
-            .zip(&*self.info)
-            .all(|(bucket, info)| bucket.should_allow_request(info, now, n));
+            .zip(self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now));

        if should_allow_request {
            // only increment the bucket counts if the request will actually be accepted
-            entry.iter_mut().for_each(|b| b.inc(n));
+            entry.iter_mut().for_each(RateBucket::inc);
        }

        should_allow_request
@@ -248,7 +223,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
    /// But that way deletion does not aquire mutex on each entry access.
    pub fn do_gc(&self) {
        info!(
-            "cleaning up bucket rate limiter, current size = {}",
+            "cleaning up endpoint rate limiter, current size = {}",
            self.map.len()
        );
        let n = self.map.shards().len();
@@ -559,7 +534,7 @@ mod tests {
    use rustc_hash::FxHasher;
    use tokio::time;

-    use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome};
+    use super::{EndpointRateLimiter, Limiter, Outcome};
    use crate::{
        rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
        EndpointId,
@@ -697,12 +672,12 @@ mod tests {

    #[test]
    fn default_rate_buckets() {
-        let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
+        let mut defaults = RateBucketInfo::DEFAULT_SET;
        RateBucketInfo::validate(&mut defaults[..]).unwrap();
    }

    #[test]
-    #[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
+    #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
    fn rate_buckets_validate() {
        let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
            .into_iter()
@@ -718,42 +693,42 @@ mod tests {
            .map(|s| s.parse().unwrap())
            .collect();
        RateBucketInfo::validate(&mut rates).unwrap();
-        let limiter = EndpointRateLimiter::new(rates);
+        let limiter = EndpointRateLimiter::new(Vec::leak(rates));

        let endpoint = EndpointId::from("ep-my-endpoint-1234");

        time::pause();

        for _ in 0..100 {
-            assert!(limiter.check(endpoint.clone(), 1));
+            assert!(limiter.check(endpoint.clone()));
        }
        // more connections fail
-        assert!(!limiter.check(endpoint.clone(), 1));
+        assert!(!limiter.check(endpoint.clone()));

        // fail even after 500ms as it's in the same bucket
        time::advance(time::Duration::from_millis(500)).await;
-        assert!(!limiter.check(endpoint.clone(), 1));
+        assert!(!limiter.check(endpoint.clone()));

        // after a full 1s, 100 requests are allowed again
        time::advance(time::Duration::from_millis(500)).await;
        for _ in 1..6 {
-            for _ in 0..50 {
-                assert!(limiter.check(endpoint.clone(), 2));
+            for _ in 0..100 {
+                assert!(limiter.check(endpoint.clone()));
            }
            time::advance(time::Duration::from_millis(1000)).await;
        }

        // more connections after 600 will exceed the 20rps@30s limit
-        assert!(!limiter.check(endpoint.clone(), 1));
+        assert!(!limiter.check(endpoint.clone()));

        // will still fail before the 30 second limit
        time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
-        assert!(!limiter.check(endpoint.clone(), 1));
+        assert!(!limiter.check(endpoint.clone()));

        // after the full 30 seconds, 100 requests are allowed again
        time::advance(time::Duration::from_millis(1)).await;
        for _ in 0..100 {
-            assert!(limiter.check(endpoint.clone(), 1));
+            assert!(limiter.check(endpoint.clone()));
        }
    }

@@ -763,41 +738,14 @@ mod tests {
        let rand = rand::rngs::StdRng::from_seed([1; 32]);
        let hasher = BuildHasherDefault::<FxHasher>::default();

-        let limiter = BucketRateLimiter::new_with_rand_and_hasher(
-            &RateBucketInfo::DEFAULT_ENDPOINT_SET,
+        let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
+            &RateBucketInfo::DEFAULT_SET,
            rand,
            hasher,
        );
        for i in 0..1_000_000 {
-            limiter.check(i, 1);
+            limiter.check(format!("{i}").into());
        }
        assert!(limiter.map.len() < 150_000);
    }
-
-    #[test]
-    fn test_default_auth_set() {
-        // these values used to exceed u32::MAX
-        assert_eq!(
-            RateBucketInfo::DEFAULT_AUTH_SET,
-            [
-                RateBucketInfo {
-                    interval: Duration::from_secs(1),
-                    max_rpi: 300 * 4096,
-                },
-                RateBucketInfo {
-                    interval: Duration::from_secs(60),
-                    max_rpi: 200 * 4096 * 60,
-                },
-                RateBucketInfo {
-                    interval: Duration::from_secs(600),
-                    max_rpi: 100 * 4096 * 600,
-                }
-            ]
-        );
-
-        for x in RateBucketInfo::DEFAULT_AUTH_SET {
-            let y = x.to_string().parse().unwrap();
-            assert_eq!(x, y);
-        }
-    }
 }
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -50,13 +50,13 @@ impl ServerSecret {
    /// To avoid revealing information to an attacker, we use a
    /// mocked server secret even if the user doesn't exist.
    /// See `auth-scram.c : mock_scram_secret` for details.
-    pub fn mock(nonce: [u8; 32]) -> Self {
+    pub fn mock(user: &str, nonce: [u8; 32]) -> Self {
+        // Refer to `auth-scram.c : scram_mock_salt`.
+        let mocked_salt = super::sha256([user.as_bytes(), &nonce]);
+
        Self {
-            // this doesn't reveal much information as we're going to use
-            // iteration count 1 for our generated passwords going forward.
-            // PG16 users can set iteration count=1 already today.
-            iterations: 1,
-            salt_base64: base64::encode(nonce),
+            iterations: 4096,
+            salt_base64: base64::encode(mocked_salt),
            stored_key: ScramKey::default(),
            server_key: ScramKey::default(),
            doomed: true,
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -42,12 +42,7 @@ impl PoolingBackend {
        };

        let secret = match cached_secret.value.clone() {
-            Some(secret) => self.config.authentication_config.check_rate_limit(
-                ctx,
-                secret,
-                &user_info.endpoint,
-                true,
-            )?,
+            Some(secret) => secret,
            None => {
                // If we don't have an authentication secret, for the http flow we can just return an error.
                info!("authentication info not found");
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -42,15 +42,12 @@ use crate::error::ReportableError;
 use crate::error::UserFacingError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
-use crate::proxy::run_until_cancelled;
 use crate::proxy::NeonOptions;
 use crate::serverless::backend::HttpConnError;
-use crate::usage_metrics::MetricCounterRecorder;
 use crate::DbName;
 use crate::RoleName;

 use super::backend::PoolingBackend;
-use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -222,7 +219,14 @@ pub async fn handle(
    backend: Arc<PoolingBackend>,
    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    let cancel2 = cancel.clone();
+    let handle = tokio::spawn(async move {
+        time::sleep(config.http_config.request_timeout).await;
+        cancel2.cancel();
+    });
+
    let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
+    handle.abort();

    let mut response = match result {
        Ok(r) => {
@@ -233,7 +237,10 @@ pub async fn handle(
            let error_kind = e.get_error_kind();
            ctx.set_error_kind(error_kind);

-            let message = "Query cancelled, connection was terminated";
+            let message = format!(
+                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
+                config.http_config.request_timeout.as_secs_f64()
+            );

            tracing::info!(
                kind=error_kind.to_metric_label(),
@@ -427,63 +434,6 @@ impl ReportableError for SqlOverHttpCancel {
    }
 }

-#[derive(Clone, Copy, Debug)]
-struct HttpHeaders {
-    raw_output: bool,
-    default_array_mode: bool,
-    txn_isolation_level: Option<IsolationLevel>,
-    txn_read_only: bool,
-    txn_deferrable: bool,
-}
-
-impl HttpHeaders {
-    fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
-        // Determine the output options. Default behaviour is 'false'. Anything that is not
-        // strictly 'true' assumed to be false.
-        let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-        let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
-
-        // isolation level, read only and deferrable
-        let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
-            Some(x) => Some(
-                map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
-            ),
-            None => None,
-        };
-
-        let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
-        let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
-
-        Ok(Self {
-            raw_output,
-            default_array_mode,
-            txn_isolation_level,
-            txn_read_only,
-            txn_deferrable,
-        })
-    }
-}
-
-fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
-    match level.as_bytes() {
-        b"Serializable" => Some(IsolationLevel::Serializable),
-        b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
-        b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
-        b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
-        _ => None,
-    }
-}
-
-fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
-    match level {
-        IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
-        IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
-        IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
-        IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
-        _ => None,
-    }
-}
-
 async fn handle_inner(
    cancel: CancellationToken,
    config: &'static ProxyConfig,
@@ -500,26 +450,43 @@ async fn handle_inner(
    // Determine the destination and connection params
    //
    let headers = request.headers();
-
    // TLS config should be there.
    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
    info!(user = conn_info.user_info.user.as_str(), "credentials");

+    // Determine the output options. The default is 'false': any header value that is not
+    // strictly 'true' is treated as false.
+    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
+    let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+
    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
    let allow_pool = !config.http_config.pool_options.opt_in
        || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

-    let parsed_headers = HttpHeaders::try_parse(headers)?;
+    // isolation level, read only and deferrable
+
+    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
+    let txn_isolation_level = match txn_isolation_level_raw {
+        Some(ref x) => Some(match x.as_bytes() {
+            b"Serializable" => IsolationLevel::Serializable,
+            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
+            b"ReadCommitted" => IsolationLevel::ReadCommitted,
+            b"RepeatableRead" => IsolationLevel::RepeatableRead,
+            _ => return Err(SqlOverHttpError::InvalidIsolationLevel),
+        }),
+        None => None,
+    };
+
+    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
    };
    info!(request_content_length, "request size in bytes");
-    HTTP_CONTENT_LENGTH
-        .with_label_values(&["request"])
-        .observe(request_content_length as f64);
+    HTTP_CONTENT_LENGTH.observe(request_content_length as f64);

    // we don't have a streaming request support yet so this is to prevent OOM
    // from a malicious user sending an extremely large request body
@@ -547,18 +514,20 @@ async fn handle_inner(
    }
    .map_err(SqlOverHttpError::from);

-    let (payload, mut client) = match run_until_cancelled(
-        // Run both operations in parallel
+    // Run both operations in parallel
+    let (payload, mut client) = match select(
        try_join(
            pin!(fetch_and_process_request),
            pin!(authenticate_and_connect),
        ),
-        &cancel,
+        pin!(cancel.cancelled()),
    )
    .await
    {
-        Some(result) => result?,
-        None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)),
+        Either::Left((result, _cancelled)) => result?,
+        Either::Right((_cancelled, _)) => {
+            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
+        }
    };

    let mut response = Response::builder()
@@ -568,143 +537,95 @@ async fn handle_inner(
    //
    // Now execute the query and return the result
    //
+    let mut size = 0;
    let result = match payload {
-        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
-        Payload::Batch(statements) => {
-            if parsed_headers.txn_read_only {
-                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
-            }
-            if parsed_headers.txn_deferrable {
-                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
-            }
-            if let Some(txn_isolation_level) = parsed_headers
-                .txn_isolation_level
-                .and_then(map_isolation_level_to_headers)
-            {
-                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
-            }
-
-            statements
-                .process(cancel, &mut client, parsed_headers)
-                .await?
-        }
-    };
-
-    let metrics = client.metrics();
-
-    // how could this possibly fail
-    let body = serde_json::to_string(&result).expect("json serialization should not fail");
-    let len = body.len();
-    let response = response
-        .body(Body::from(body))
-        // only fails if invalid status code or invalid header/values are given.
-        // these are not user configurable so it cannot fail dynamically
-        .expect("building response payload should not fail");
-
-    // count the egress bytes - we miss the TLS and header overhead but oh well...
-    // moving this later in the stack is going to be a lot of effort and ehhhh
-    metrics.record_egress(len as u64);
-    HTTP_CONTENT_LENGTH
-        .with_label_values(&["response"])
-        .observe(len as f64);
-
-    Ok(response)
-}
-
-impl QueryData {
-    async fn process(
-        self,
-        cancel: CancellationToken,
-        client: &mut Client<tokio_postgres::Client>,
-        parsed_headers: HttpHeaders,
-    ) -> Result<Value, SqlOverHttpError> {
-        let (inner, mut discard) = client.inner();
-        let cancel_token = inner.cancel_token();
-
-        let res = match select(
-            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
-            pin!(cancel.cancelled()),
-        )
-        .await
-        {
-            // The query successfully completed.
-            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
-                discard.check_idle(status);
-                Ok(results)
-            }
-            // The query failed with an error
-            Either::Left((Err(e), __not_yet_cancelled)) => {
-                discard.discard();
-                return Err(e);
-            }
-            // The query was cancelled.
-            Either::Right((_cancelled, query)) => {
-                if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                    tracing::error!(?err, "could not cancel query");
+        Payload::Single(stmt) => {
+            let mut size = 0;
+            let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
+            let query = pin!(query_to_json(
+                &*inner,
+                stmt,
+                &mut size,
+                raw_output,
+                default_array_mode
+            ));
+            let cancelled = pin!(cancel.cancelled());
+            let res = select(query, cancelled).await;
+            match res {
+                Either::Left((Ok((status, results)), _cancelled)) => {
+                    discard.check_idle(status);
+                    results
                }
-                // wait for the query cancellation
-                match time::timeout(time::Duration::from_millis(100), query).await {
-                    // query successed before it was cancelled.
-                    Ok(Ok((status, results))) => {
-                        discard.check_idle(status);
-                        Ok(results)
+                Either::Left((Err(e), _cancelled)) => {
+                    discard.discard();
+                    return Err(e);
+                }
+                Either::Right((_cancelled, query)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
                    }
-                    // query failed or was cancelled.
-                    Ok(Err(error)) => {
-                        let db_error = match &error {
-                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
-                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
-                            _ => None,
-                        };
-
-                        // if errored for some other reason, it might not be safe to return
-                        if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
-                            discard.discard();
+                    match time::timeout(time::Duration::from_millis(100), query).await {
+                        Ok(Ok((status, results))) => {
+                            discard.check_idle(status);
+                            results
                        }
+                        Ok(Err(error)) => {
+                            let db_error = match &error {
+                                SqlOverHttpError::ConnectCompute(
+                                    HttpConnError::ConnectionError(e),
+                                )
+                                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                                _ => None,
+                            };

-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
-                    }
-                    Err(_timeout) => {
-                        discard.discard();
-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                            // if errored for some other reason, it might not be safe to return
+                            if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
+                                discard.discard();
+                            }
+
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                        }
+                        Err(_timeout) => {
+                            discard.discard();
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                        }
                    }
                }
            }
-        };
-        res
-    }
-}
-
-impl BatchQueryData {
-    async fn process(
-        self,
-        cancel: CancellationToken,
-        client: &mut Client<tokio_postgres::Client>,
-        parsed_headers: HttpHeaders,
-    ) -> Result<Value, SqlOverHttpError> {
-        info!("starting transaction");
-        let (inner, mut discard) = client.inner();
-        let cancel_token = inner.cancel_token();
-        let mut builder = inner.build_transaction();
-        if let Some(isolation_level) = parsed_headers.txn_isolation_level {
-            builder = builder.isolation_level(isolation_level);
-        }
-        if parsed_headers.txn_read_only {
-            builder = builder.read_only(true);
-        }
-        if parsed_headers.txn_deferrable {
-            builder = builder.deferrable(true);
        }
+        Payload::Batch(statements) => {
+            info!("starting transaction");
+            let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
+            let mut builder = inner.build_transaction();
+            if let Some(isolation_level) = txn_isolation_level {
+                builder = builder.isolation_level(isolation_level);
+            }
+            if txn_read_only {
+                builder = builder.read_only(true);
+            }
+            if txn_deferrable {
+                builder = builder.deferrable(true);
+            }

-        let transaction = builder.start().await.map_err(|e| {
-            // if we cannot start a transaction, we should return immediately
-            // and not return to the pool. connection is clearly broken
-            discard.discard();
-            e
-        })?;
+            let transaction = builder.start().await.map_err(|e| {
+                // if we cannot start a transaction, we should return immediately
+                // and not return to the pool. connection is clearly broken
+                discard.discard();
+                e
+            })?;

-        let results =
-            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
+            let results = match query_batch(
+                cancel.child_token(),
+                &transaction,
+                statements,
+                &mut size,
+                raw_output,
+                default_array_mode,
+            )
+            .await
+            {
                Ok(results) => {
                    info!("commit");
                    let status = transaction.commit().await.map_err(|e| {
@@ -738,15 +659,44 @@ impl BatchQueryData {
                }
            };

-        Ok(json!({ "results": results }))
-    }
+            if txn_read_only {
+                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
+            }
+            if txn_deferrable {
+                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
+            }
+            if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+            }
+            json!({ "results": results })
+        }
+    };
+
+    let metrics = client.metrics();
+
+    // how could this possibly fail
+    let body = serde_json::to_string(&result).expect("json serialization should not fail");
+    let len = body.len();
+    let response = response
+        .body(Body::from(body))
+        // only fails if invalid status code or invalid header/values are given.
+        // these are not user configurable so it cannot fail dynamically
+        .expect("building response payload should not fail");
+
+    // Count the egress bytes. This misses the TLS and header overhead, but accounting for
+    // it would mean moving this later in the stack, which is a lot of effort.
+    metrics.record_egress(len as u64);
+
+    Ok(response)
 }

 async fn query_batch(
    cancel: CancellationToken,
    transaction: &Transaction<'_>,
    queries: BatchQueryData,
-    parsed_headers: HttpHeaders,
+    total_size: &mut usize,
+    raw_output: bool,
+    array_mode: bool,
 ) -> Result<Vec<Value>, SqlOverHttpError> {
    let mut results = Vec::with_capacity(queries.queries.len());
    let mut current_size = 0;
@@ -755,7 +705,8 @@ async fn query_batch(
            transaction,
            stmt,
            &mut current_size,
-            parsed_headers,
+            raw_output,
+            array_mode
        ));
        let cancelled = pin!(cancel.cancelled());
        let res = select(query, cancelled).await;
@@ -772,6 +723,7 @@ async fn query_batch(
            }
        }
    }
+    *total_size += current_size;
    Ok(results)
 }

@@ -779,7 +731,8 @@ async fn query_to_json<T: GenericClient>(
    client: &T,
    data: QueryData,
    current_size: &mut usize,
-    parsed_headers: HttpHeaders,
+    raw_output: bool,
+    default_array_mode: bool,
 ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
    info!("executing query");
    let query_params = data.params;
@@ -839,12 +792,12 @@ async fn query_to_json<T: GenericClient>(
        columns.push(client.get_type(c.type_oid()).await?);
    }

-    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
+    let array_mode = data.array_mode.unwrap_or(default_array_mode);

    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

    // resulting JSON format is based on the format of node-postgres result
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,34 +1,20 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{
-    config::{MetricBackupCollectionConfig, MetricCollectionConfig},
-    context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
-    http, BranchId, EndpointId,
-};
-use anyhow::Context;
-use async_compression::tokio::write::GzipEncoder;
-use bytes::Bytes;
-use chrono::{DateTime, Datelike, Timelike, Utc};
+use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
+use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
-use futures::future::select;
 use once_cell::sync::Lazy;
-use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use serde::{Deserialize, Serialize};
 use std::{
    convert::Infallible,
-    pin::pin,
    sync::{
        atomic::{AtomicU64, AtomicUsize, Ordering},
        Arc,
    },
    time::Duration,
 };
-use tokio::io::AsyncWriteExt;
-use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, trace};
-use utils::backoff;
-use uuid::{NoContext, Timestamp};

 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

@@ -47,93 +33,19 @@ pub struct Ids {
    pub branch_id: BranchId,
 }

-pub trait MetricCounterRecorder {
-    /// Record that some bytes were sent from the proxy to the client
-    fn record_egress(&self, bytes: u64);
-    /// Record that some connections were opened
-    fn record_connection(&self, count: usize);
-}
-
-trait MetricCounterReporter {
-    fn get_metrics(&mut self) -> (u64, usize);
-    fn move_metrics(&self) -> (u64, usize);
-}
-
-#[derive(Debug)]
-struct MetricBackupCounter {
-    transmitted: AtomicU64,
-    opened_connections: AtomicUsize,
-}
-
-impl MetricCounterRecorder for MetricBackupCounter {
-    fn record_egress(&self, bytes: u64) {
-        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
-    }
-
-    fn record_connection(&self, count: usize) {
-        self.opened_connections.fetch_add(count, Ordering::AcqRel);
-    }
-}
-
-impl MetricCounterReporter for MetricBackupCounter {
-    fn get_metrics(&mut self) -> (u64, usize) {
-        (
-            *self.transmitted.get_mut(),
-            *self.opened_connections.get_mut(),
-        )
-    }
-    fn move_metrics(&self) -> (u64, usize) {
-        (
-            self.transmitted.swap(0, Ordering::AcqRel),
-            self.opened_connections.swap(0, Ordering::AcqRel),
-        )
-    }
-}
-
 #[derive(Debug)]
 pub struct MetricCounter {
    transmitted: AtomicU64,
    opened_connections: AtomicUsize,
-    backup: Arc<MetricBackupCounter>,
 }

-impl MetricCounterRecorder for MetricCounter {
+impl MetricCounter {
    /// Record that some bytes were sent from the proxy to the client
-    fn record_egress(&self, bytes: u64) {
+    pub fn record_egress(&self, bytes: u64) {
        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
-        self.backup.record_egress(bytes);
    }

-    /// Record that some connections were opened
-    fn record_connection(&self, count: usize) {
-        self.opened_connections.fetch_add(count, Ordering::AcqRel);
-        self.backup.record_connection(count);
-    }
-}
-
-impl MetricCounterReporter for MetricCounter {
-    fn get_metrics(&mut self) -> (u64, usize) {
-        (
-            *self.transmitted.get_mut(),
-            *self.opened_connections.get_mut(),
-        )
-    }
-    fn move_metrics(&self) -> (u64, usize) {
-        (
-            self.transmitted.swap(0, Ordering::AcqRel),
-            self.opened_connections.swap(0, Ordering::AcqRel),
-        )
-    }
-}
-
-trait Clearable {
    /// extract the value that should be reported
-    fn should_report(self: &Arc<Self>) -> Option<u64>;
-    /// Determine whether the counter should be cleared from the global map.
-    fn should_clear(self: &mut Arc<Self>) -> bool;
-}
-
-impl<C: MetricCounterReporter> Clearable for C {
    fn should_report(self: &Arc<Self>) -> Option<u64> {
        // heuristic to see if the branch is still open
        // if a clone happens while we are observing, the heuristic will be incorrect.
@@ -142,12 +54,13 @@ impl<C: MetricCounterReporter> Clearable for C {
        // However, for the strong count to be 1 it must have occured that at one instant
        // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
        let is_open = Arc::strong_count(self) > 1;
+        let opened = self.opened_connections.swap(0, Ordering::AcqRel);

        // update cached metrics eagerly, even if they can't get sent
        // (to avoid sending the same metrics twice)
        // see the relevant discussion on why to do so even if the status is not success:
        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        let (value, opened) = self.move_metrics();
+        let value = self.transmitted.swap(0, Ordering::AcqRel);

        // Our only requirement is that we report in every interval if there was an open connection
        // if there were no opened connections since, then we don't need to report
@@ -157,12 +70,15 @@ impl<C: MetricCounterReporter> Clearable for C {
            Some(value)
        }
    }
+
+    /// Determine whether the counter should be cleared from the global map.
    fn should_clear(self: &mut Arc<Self>) -> bool {
        // we can't clear this entry if it's acquired elsewhere
        let Some(counter) = Arc::get_mut(self) else {
            return false;
        };
-        let (opened, value) = counter.get_metrics();
+        let opened = *counter.opened_connections.get_mut();
+        let value = *counter.transmitted.get_mut();
        // clear if there's no data to report
        value == 0 && opened == 0
    }
@@ -174,26 +90,11 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
 #[derive(Default)]
 pub struct Metrics {
    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
-    backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
 }

 impl Metrics {
    /// Register a new byte metrics counter for this endpoint
    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
-        let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
-            entry.clone()
-        } else {
-            self.backup_endpoints
-                .entry(ids.clone())
-                .or_insert_with(|| {
-                    Arc::new(MetricBackupCounter {
-                        transmitted: AtomicU64::new(0),
-                        opened_connections: AtomicUsize::new(0),
-                    })
-                })
-                .clone()
-        };
-
        let entry = if let Some(entry) = self.endpoints.get(&ids) {
            entry.clone()
        } else {
@@ -203,13 +104,12 @@ impl Metrics {
                    Arc::new(MetricCounter {
                        transmitted: AtomicU64::new(0),
                        opened_connections: AtomicUsize::new(0),
-                        backup: backup.clone(),
                    })
                })
                .clone()
        };

-        entry.record_connection(1);
+        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
        entry
    }
 }
@@ -232,7 +132,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall

        let now = Utc::now();
        collect_metrics_iteration(
-            &USAGE_METRICS.endpoints,
+            &USAGE_METRICS,
            &http_client,
            &config.endpoint,
            &hostname,
@@ -244,66 +144,9 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
    }
 }

-fn collect_and_clear_metrics<C: Clearable>(
-    endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
-) -> Vec<(Ids, u64)> {
-    let mut metrics_to_clear = Vec::new();
-
-    let metrics_to_send: Vec<(Ids, u64)> = endpoints
-        .iter()
-        .filter_map(|counter| {
-            let key = counter.key().clone();
-            let Some(value) = counter.should_report() else {
-                metrics_to_clear.push(key);
-                return None;
-            };
-            Some((key, value))
-        })
-        .collect();
-
-    for metric in metrics_to_clear {
-        match endpoints.entry(metric) {
-            Entry::Occupied(mut counter) => {
-                if counter.get_mut().should_clear() {
-                    counter.remove_entry();
-                }
-            }
-            Entry::Vacant(_) => {}
-        }
-    }
-    metrics_to_send
-}
-
-fn create_event_chunks<'a>(
-    metrics_to_send: &'a [(Ids, u64)],
-    hostname: &'a str,
-    prev: DateTime<Utc>,
-    now: DateTime<Utc>,
-    chunk_size: usize,
-) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
-    metrics_to_send
-        .chunks(chunk_size)
-        .map(move |chunk| EventChunk {
-            events: chunk
-                .iter()
-                .map(|(ids, value)| Event {
-                    kind: EventType::Incremental {
-                        start_time: prev,
-                        stop_time: now,
-                    },
-                    metric: PROXY_IO_BYTES_PER_CLIENT,
-                    idempotency_key: idempotency_key(hostname),
-                    value: *value,
-                    extra: ids.clone(),
-                })
-                .collect(),
-        })
-}
-
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
-    endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+    metrics: &Metrics,
    client: &http::ClientWithMiddleware,
    metric_collection_endpoint: &reqwest::Url,
    hostname: &str,
@@ -315,17 +158,48 @@ async fn collect_metrics_iteration(
        metric_collection_endpoint
    );

-    let metrics_to_send = collect_and_clear_metrics(endpoints);
+    let mut metrics_to_clear = Vec::new();
+
+    let metrics_to_send: Vec<(Ids, u64)> = metrics
+        .endpoints
+        .iter()
+        .filter_map(|counter| {
+            let key = counter.key().clone();
+            let Some(value) = counter.should_report() else {
+                metrics_to_clear.push(key);
+                return None;
+            };
+            Some((key, value))
+        })
+        .collect();

    if metrics_to_send.is_empty() {
        trace!("no new metrics to send");
    }

    // Send metrics.
-    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
+    // Split into chunks of 1000 metrics to avoid exceeding the max request size
+    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
+        let events = chunk
+            .iter()
+            .map(|(ids, value)| Event {
+                kind: EventType::Incremental {
+                    start_time: prev,
+                    stop_time: now,
+                },
+                metric: PROXY_IO_BYTES_PER_CLIENT,
+                idempotency_key: idempotency_key(hostname),
+                value: *value,
+                extra: Ids {
+                    endpoint_id: ids.endpoint_id.clone(),
+                    branch_id: ids.branch_id.clone(),
+                },
+            })
+            .collect();
+
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&chunk)
+            .json(&EventChunk { events })
            .send()
            .await;

@@ -339,144 +213,25 @@ async fn collect_metrics_iteration(

        if !res.status().is_success() {
            error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
+            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
                // Report if the metric value is suspiciously large
                error!("potentially abnormal metric value: {:?}", metric);
            }
        }
    }
-}

-pub async fn task_backup(
-    backup_config: &MetricBackupCollectionConfig,
-    cancellation_token: CancellationToken,
-) -> anyhow::Result<()> {
-    info!("metrics backup config: {backup_config:?}");
-    scopeguard::defer! {
-        info!("metrics backup has shut down");
-    }
-    // Even if the remote storage is not configured, we still want to clear the metrics.
-    let storage = backup_config
-        .remote_storage_config
-        .as_ref()
-        .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
-        .transpose()?;
-    let mut ticker = tokio::time::interval(backup_config.interval);
-    let mut prev = Utc::now();
-    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
-    loop {
-        select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
-        let now = Utc::now();
-        collect_metrics_backup_iteration(
-            &USAGE_METRICS.backup_endpoints,
-            &storage,
-            &hostname,
-            prev,
-            now,
-            backup_config.chunk_size,
-        )
-        .await;
-
-        prev = now;
-        if cancellation_token.is_cancelled() {
-            info!("metrics backup has been cancelled");
-            break;
-        }
-    }
-    Ok(())
-}
-
-#[instrument(skip_all)]
-async fn collect_metrics_backup_iteration(
-    endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
-    storage: &Option<GenericRemoteStorage>,
-    hostname: &str,
-    prev: DateTime<Utc>,
-    now: DateTime<Utc>,
-    chunk_size: usize,
-) {
-    let year = now.year();
-    let month = now.month();
-    let day = now.day();
-    let hour = now.hour();
-    let minute = now.minute();
-    let second = now.second();
-    let cancel = CancellationToken::new();
-
-    info!("starting collect_metrics_backup_iteration");
-
-    let metrics_to_send = collect_and_clear_metrics(endpoints);
-
-    if metrics_to_send.is_empty() {
-        trace!("no new metrics to send");
-    }
-
-    // Send metrics.
-    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
-        let real_now = Utc::now();
-        let id = uuid::Uuid::new_v7(Timestamp::from_unix(
-            NoContext,
-            real_now.second().into(),
-            real_now.nanosecond(),
-        ));
-        let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
-        let remote_path = match RemotePath::from_string(&path) {
-            Ok(remote_path) => remote_path,
-            Err(e) => {
-                error!("failed to create remote path from str {path}: {:?}", e);
-                continue;
+    for metric in metrics_to_clear {
+        match metrics.endpoints.entry(metric) {
+            Entry::Occupied(mut counter) => {
+                if counter.get_mut().should_clear() {
+                    counter.remove_entry();
+                }
            }
-        };
-
-        let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
-
-        if let Err(e) = res {
-            error!(
-                "failed to upload consumption events to remote storage: {:?}",
-                e
-            );
+            Entry::Vacant(_) => {}
        }
    }
 }

-async fn upload_events_chunk(
-    storage: &Option<GenericRemoteStorage>,
-    chunk: EventChunk<'_, Event<Ids, &'static str>>,
-    remote_path: &RemotePath,
-    cancel: &CancellationToken,
-) -> anyhow::Result<()> {
-    let storage = match storage {
-        Some(storage) => storage,
-        None => {
-            error!("no remote storage configured");
-            return Ok(());
-        }
-    };
-    let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
-    let mut encoder = GzipEncoder::new(Vec::new());
-    encoder.write_all(&data).await.context("compress metrics")?;
-    encoder.shutdown().await.context("compress metrics")?;
-    let compressed_data: Bytes = encoder.get_ref().clone().into();
-    backoff::retry(
-        || async {
-            let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
-            storage
-                .upload(stream, compressed_data.len(), remote_path, None, cancel)
-                .await
-        },
-        TimeoutOrCancel::caused_by_cancel,
-        FAILED_UPLOAD_WARN_THRESHOLD,
-        FAILED_UPLOAD_MAX_RETRIES,
-        "request_data_upload",
-        cancel,
-    )
-    .await
-    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-    .and_then(|x| x)
-    .context("request_data_upload")?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
    use std::{
@@ -493,7 +248,7 @@ mod tests {
    };
    use url::Url;

-    use super::*;
+    use super::{collect_metrics_iteration, Ids, Metrics};
    use crate::{http, rate_limiter::RateLimiterConfig};

    #[tokio::test]
@@ -529,19 +284,18 @@ mod tests {
        let now = Utc::now();

        // no counters have been registered
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert!(r.is_empty());

        // register a new counter
-
        let counter = metrics.register(Ids {
            endpoint_id: "e1".into(),
            branch_id: "b1".into(),
        });

        // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
@@ -551,7 +305,7 @@ mod tests {
        counter.record_egress(1);

        // egress should be observered
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
@@ -561,19 +315,11 @@ mod tests {
        drop(counter);

        // we do not observe the counter
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
        let r = std::mem::take(&mut *reports2.lock().unwrap());
        assert!(r.is_empty());

        // counter is unregistered
        assert!(metrics.endpoints.is_empty());
-
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
-            .await;
-        assert!(!metrics.backup_endpoints.is_empty());
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
-            .await;
-        // backup counter is unregistered after the second iteration
-        assert!(metrics.backup_endpoints.is_empty());
    }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,5 +94,4 @@ select = [
    "I", # isort
    "W", # pycodestyle
    "B", # bugbear
-    "UP032", # f-string
 ]
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -33,7 +33,6 @@ once_cell.workspace = true
 parking_lot.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
-rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -28,7 +28,7 @@ use utils::pid_file;
 use metrics::set_build_info_metric;
 use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
-    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
+    DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
@@ -170,13 +170,6 @@ struct Args {
    /// still needed for existing replication connection.
    #[arg(long)]
    walsenders_keep_horizon: bool,
-    /// Enable partial backup. If disabled, safekeeper will not upload partial
-    /// segments to remote storage.
-    #[arg(long)]
-    partial_backup_enabled: bool,
-    /// Controls how long backup will wait until uploading the partial segment.
-    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
-    partial_backup_timeout: Duration,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -307,8 +300,6 @@ async fn main() -> anyhow::Result<()> {
        http_auth,
        current_thread_runtime: args.current_thread_runtime,
        walsenders_keep_horizon: args.walsenders_keep_horizon,
-        partial_backup_enabled: args.partial_backup_enabled,
-        partial_backup_timeout: args.partial_backup_timeout,
    };

    // initialize sentry if SENTRY_DSN is provided
@@ -374,8 +365,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {

    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);

-    wal_backup::init_remote_storage(&conf);
-
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
 use crate::SafeKeeperConf;

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 8;
+pub const SK_FORMAT_VERSION: u32 = 7;

 // contains persistent metadata for safekeeper
 const CONTROL_FILE_NAME: &str = "safekeeper.control";
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -2,7 +2,6 @@
 use crate::{
    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
    state::{PersistedPeers, TimelinePersistentState},
-    wal_backup_partial,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
@@ -139,50 +138,6 @@ pub struct SafeKeeperStateV4 {
    pub peers: PersistedPeers,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-pub struct SafeKeeperStateV7 {
-    #[serde(with = "hex")]
-    pub tenant_id: TenantId,
-    #[serde(with = "hex")]
-    pub timeline_id: TimelineId,
-    /// persistent acceptor state
-    pub acceptor_state: AcceptorState,
-    /// information about server
-    pub server: ServerInfo,
-    /// Unique id of the last *elected* proposer we dealt with. Not needed
-    /// for correctness, exists for monitoring purposes.
-    #[serde(with = "hex")]
-    pub proposer_uuid: PgUuid,
-    /// Since which LSN this timeline generally starts. Safekeeper might have
-    /// joined later.
-    pub timeline_start_lsn: Lsn,
-    /// Since which LSN safekeeper has (had) WAL for this timeline.
-    /// All WAL segments next to one containing local_start_lsn are
-    /// filled with data from the beginning.
-    pub local_start_lsn: Lsn,
-    /// Part of WAL acknowledged by quorum *and available locally*. Always points
-    /// to record boundary.
-    pub commit_lsn: Lsn,
-    /// LSN that points to the end of the last backed up segment. Useful to
-    /// persist to avoid finding out offloading progress on boot.
-    pub backup_lsn: Lsn,
-    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
-    /// of last record streamed to everyone). Persisting it helps skipping
-    /// recovery in walproposer, generally we compute it from peers. In
-    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
-    /// only by walproposer.
-    pub peer_horizon_lsn: Lsn,
-    /// LSN of the oldest known checkpoint made by pageserver and successfully
-    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
-    /// informational purposes, we receive it from pageserver (or broker).
-    pub remote_consistent_lsn: Lsn,
-    // Peers and their state as we remember it. Knowing peers themselves is
-    // fundamental; but state is saved here only for informational purposes and
-    // obviously can be stale. (Currently not saved at all, but let's provision
-    // place to have less file version upgrades).
-    pub peers: PersistedPeers,
-}
-
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
    // migrate to storing full term history
    if version == 1 {
@@ -212,7 +167,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
-            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -236,7 +190,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
-            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -260,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
-            partial_backup: wal_backup_partial::State::default(),
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -284,7 +236,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            peer_horizon_lsn: oldstate.peer_horizon_lsn,
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
-            partial_backup: wal_backup_partial::State::default(),
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
@@ -311,30 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
        oldstate.server.pg_version = 140005;

        return Ok(oldstate);
-    } else if version == 7 {
-        info!("reading safekeeper control file version {}", version);
-        let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?;
-
-        return Ok(TimelinePersistentState {
-            tenant_id: oldstate.tenant_id,
-            timeline_id: oldstate.timeline_id,
-            acceptor_state: oldstate.acceptor_state,
-            server: oldstate.server,
-            proposer_uuid: oldstate.proposer_uuid,
-            timeline_start_lsn: oldstate.timeline_start_lsn,
-            local_start_lsn: oldstate.local_start_lsn,
-            commit_lsn: oldstate.commit_lsn,
-            backup_lsn: oldstate.backup_lsn,
-            peer_horizon_lsn: oldstate.peer_horizon_lsn,
-            remote_consistent_lsn: oldstate.remote_consistent_lsn,
-            peers: oldstate.peers,
-            partial_backup: wal_backup_partial::State::default(),
-        });
    }
-
-    // TODO: persist the file back to the disk after upgrade
-    // TODO: think about backward compatibility and rollbacks
-
    bail!("unsupported safekeeper control file version {}", version)
 }

--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -32,7 +32,6 @@ pub mod send_wal;
 pub mod state;
 pub mod timeline;
 pub mod wal_backup;
-pub mod wal_backup_partial;
 pub mod wal_service;
 pub mod wal_storage;

@@ -49,7 +48,6 @@ pub mod defaults {

    pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
    pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
-    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
 }

 #[derive(Debug, Clone)]
@@ -81,8 +79,6 @@ pub struct SafeKeeperConf {
    pub http_auth: Option<Arc<SwappableJwtAuth>>,
    pub current_thread_runtime: bool,
    pub walsenders_keep_horizon: bool,
-    pub partial_backup_enabled: bool,
-    pub partial_backup_timeout: Duration,
 }

 impl SafeKeeperConf {
@@ -127,8 +123,6 @@ impl SafeKeeperConf {
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
            walsenders_keep_horizon: false,
-            partial_backup_enabled: false,
-            partial_backup_timeout: Duration::from_secs(0),
        }
    }
 }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -147,21 +147,6 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
 });
-pub static PARTIAL_BACKUP_UPLOADS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "safekeeper_partial_backup_uploads_total",
-        "Number of partial backup uploads to the S3",
-        &["result"]
-    )
-    .expect("Failed to register safekeeper_partial_backup_uploads_total counter")
-});
-pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "safekeeper_partial_backup_uploaded_bytes_total",
-        "Number of bytes uploaded to the S3 during partial backup"
-    )
-    .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
-});

 pub const LABEL_UNKNOWN: &str = "unknown";

--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -1221,7 +1221,6 @@ mod tests {
                    commit_lsn: Lsn(1234567600),
                },
            )]),
-            partial_backup: crate::wal_backup_partial::State::default(),
        };

        let ser = state.ser().unwrap();
@@ -1267,8 +1266,6 @@ mod tests {
            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-            // partial_backup
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];

        assert_eq!(Hex(&ser), Hex(&expected));
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -13,7 +13,6 @@ use utils::{
 use crate::{
    control_file,
    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
-    wal_backup_partial::{self},
 };

 /// Persistent information stored on safekeeper node about timeline.
@@ -55,14 +54,11 @@ pub struct TimelinePersistentState {
    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
    /// informational purposes, we receive it from pageserver (or broker).
    pub remote_consistent_lsn: Lsn,
-    /// Peers and their state as we remember it. Knowing peers themselves is
-    /// fundamental; but state is saved here only for informational purposes and
-    /// obviously can be stale. (Currently not saved at all, but let's provision
-    /// place to have less file version upgrades).
+    // Peers and their state as we remember it. Knowing peers themselves is
+    // fundamental; but state is saved here only for informational purposes and
+    // obviously can be stale. (Currently not saved at all, but let's provision
+    // place to have less file version upgrades).
    pub peers: PersistedPeers,
-    /// Holds names of partial segments uploaded to remote storage. Used to
-    /// clean up old objects without leaving garbage in remote storage.
-    pub partial_backup: wal_backup_partial::State,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -97,7 +93,6 @@ impl TimelinePersistentState {
                    .map(|p| (*p, PersistedPeerInfo::new()))
                    .collect(),
            ),
-            partial_backup: wal_backup_partial::State::default(),
        }
    }

--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};

 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, wal_backup_partial, wal_storage};
+use crate::{debug_dump, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};

 /// Things safekeeper should know about timeline state on peers.
@@ -503,9 +503,6 @@ impl Timeline {
        if conf.peer_recovery_enabled {
            tokio::spawn(recovery_main(self.clone(), conf.clone()));
        }
-        if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
-            tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
-        }
    }

    /// Delete timeline from disk completely, by removing timeline directory.
@@ -670,8 +667,8 @@ impl Timeline {
            term_flush_lsn =
                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
        }
-        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
        self.commit_lsn_watch_tx.send(commit_lsn)?;
+        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
        Ok(rmsg)
    }

--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -18,7 +18,7 @@ use std::time::Duration;
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::fs::File;

 use tokio::select;
@@ -180,16 +180,6 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
        .unwrap()
 }

-pub fn init_remote_storage(conf: &SafeKeeperConf) {
-    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
-    // dependencies to all tasks instead.
-    REMOTE_STORAGE.get_or_init(|| {
-        conf.remote_storage
-            .as_ref()
-            .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
-    });
-}
-
 const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;

 /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -204,6 +194,14 @@ pub async fn wal_backup_launcher_task_main(
        conf.remote_storage
    );

+    let conf_ = conf.clone();
+    REMOTE_STORAGE.get_or_init(|| {
+        conf_
+            .remote_storage
+            .as_ref()
+            .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
+    });
+
    // Presence in this map means launcher is aware s3 offloading is needed for
    // the timeline, but task is started only if it makes sense for to offload
    // from this safekeeper.
@@ -520,35 +518,6 @@ async fn backup_object(
        .await
 }

-pub(crate) async fn backup_partial_segment(
-    source_file: &Utf8Path,
-    target_file: &RemotePath,
-    size: usize,
-) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
-    let file = File::open(&source_file)
-        .await
-        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
-
-    // limiting the file to read only the first `size` bytes
-    let limited_file = tokio::io::AsyncReadExt::take(file, size as u64);
-
-    let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE);
-
-    let cancel = CancellationToken::new();
-
-    storage
-        .upload(
-            file,
-            size,
-            target_file,
-            Some(StorageMetadata::from([("sk_type", "partial_segment")])),
-            &cancel,
-        )
-        .await
-}
-
 pub async fn read_object(
    file_path: &RemotePath,
    offset: u64,
@@ -635,13 +604,6 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    Ok(())
 }

-/// Used by wal_backup_partial.
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
-    let cancel = CancellationToken::new(); // not really used
-    let storage = get_configured_remote_storage();
-    storage.delete_objects(paths, &cancel).await
-}
-
 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
    wal_seg_size: usize,
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -1,396 +0,0 @@
-//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
-//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
-//! was changed), the segment will be uploaded to S3 in about 15 minutes.
-//!
-//! The filename format for partial segments is
-//! `Segment_Term_Flush_Commit_skNN.partial`, where:
-//! - `Segment` – the segment name, like `000000010000000000000001`
-//! - `Term` – current term
-//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568`
-//! - `Commit` – commit_lsn in the same hex format
-//! - `NN` – safekeeper_id, like `1`
-//!
-//! The full object name example:
-//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial`
-//!
-//! Each safekeeper will keep info about remote partial segments in its control
-//! file. Code updates state in the control file before doing any S3 operations.
-//! This way control file stores information about all potentially existing
-//! remote partial segments and can clean them up after uploading a newer version.
-
-use std::sync::Arc;
-
-use camino::Utf8PathBuf;
-use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
-use rand::Rng;
-use remote_storage::RemotePath;
-use serde::{Deserialize, Serialize};
-
-use tracing::{debug, error, info, instrument};
-use utils::lsn::Lsn;
-
-use crate::{
-    metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
-    safekeeper::Term,
-    timeline::Timeline,
-    wal_backup, SafeKeeperConf,
-};
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-pub enum UploadStatus {
-    /// Upload is in progress
-    InProgress,
-    /// Upload is finished
-    Uploaded,
-    /// Deletion is in progress
-    Deleting,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-pub struct PartialRemoteSegment {
-    pub status: UploadStatus,
-    pub name: String,
-    pub commit_lsn: Lsn,
-    pub flush_lsn: Lsn,
-    pub term: Term,
-}
-
-impl PartialRemoteSegment {
-    fn eq_without_status(&self, other: &Self) -> bool {
-        self.name == other.name
-            && self.commit_lsn == other.commit_lsn
-            && self.flush_lsn == other.flush_lsn
-            && self.term == other.term
-    }
-}
-
-// NB: these structures are a part of a control_file, you can't change them without
-// changing the control file format version.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
-pub struct State {
-    pub segments: Vec<PartialRemoteSegment>,
-}
-
-impl State {
-    /// Find an Uploaded segment. There should be only one Uploaded segment at a time.
-    fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
-        self.segments
-            .iter()
-            .find(|seg| seg.status == UploadStatus::Uploaded)
-            .cloned()
-    }
-}
-
-struct PartialBackup {
-    wal_seg_size: usize,
-    tli: Arc<Timeline>,
-    conf: SafeKeeperConf,
-    local_prefix: Utf8PathBuf,
-    remote_prefix: Utf8PathBuf,
-
-    state: State,
-}
-
-// Read-only methods for getting segment names
-impl PartialBackup {
-    fn segno(&self, lsn: Lsn) -> XLogSegNo {
-        lsn.segment_number(self.wal_seg_size)
-    }
-
-    fn segment_name(&self, segno: u64) -> String {
-        XLogFileName(PG_TLI, segno, self.wal_seg_size)
-    }
-
-    fn remote_segment_name(
-        &self,
-        segno: u64,
-        term: u64,
-        commit_lsn: Lsn,
-        flush_lsn: Lsn,
-    ) -> String {
-        format!(
-            "{}_{}_{:016X}_{:016X}_sk{}.partial",
-            self.segment_name(segno),
-            term,
-            flush_lsn.0,
-            commit_lsn.0,
-            self.conf.my_id.0,
-        )
-    }
-
-    fn local_segment_name(&self, segno: u64) -> String {
-        format!("{}.partial", self.segment_name(segno))
-    }
-}
-
-impl PartialBackup {
-    /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded.
-    async fn prepare_upload(&self) -> PartialRemoteSegment {
-        // this operation takes a lock to get the actual state
-        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
-        let flush_lsn = Lsn(sk_info.flush_lsn);
-        let commit_lsn = Lsn(sk_info.commit_lsn);
-        let term = sk_info.term;
-        let segno = self.segno(flush_lsn);
-
-        let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn);
-
-        PartialRemoteSegment {
-            status: UploadStatus::InProgress,
-            name,
-            commit_lsn,
-            flush_lsn,
-            term,
-        }
-    }
-
-    /// Reads segment from disk and uploads it to the remote storage.
-    async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> {
-        let flush_lsn = prepared.flush_lsn;
-        let segno = self.segno(flush_lsn);
-
-        // We're going to backup bytes from the start of the segment up to flush_lsn.
-        let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
-
-        let local_path = self.local_prefix.join(self.local_segment_name(segno));
-        let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
-
-        // Upload first `backup_bytes` bytes of the segment to the remote storage.
-        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
-        PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
-
-        // We uploaded the segment, now let's verify that the data is still actual.
-        // If the term changed, we cannot guarantee the validity of the uploaded data.
-        // If the term is the same, we know the data is not corrupted.
-        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
-        if sk_info.term != prepared.term {
-            anyhow::bail!("term changed during upload");
-        }
-        assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
-        assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
-
-        Ok(())
-    }
-
-    /// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
-    async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
-        self.tli
-            .map_control_file(|cf| {
-                if cf.partial_backup != self.state {
-                    let memory = self.state.clone();
-                    self.state = cf.partial_backup.clone();
-                    anyhow::bail!(
-                        "partial backup state diverged, memory={:?}, disk={:?}",
-                        memory,
-                        cf.partial_backup
-                    );
-                }
-
-                cf.partial_backup = new_state.clone();
-                Ok(())
-            })
-            .await?;
-        // update in-memory state
-        self.state = new_state;
-        Ok(())
-    }
-
-    /// Upload the latest version of the partial segment and garbage collect older versions.
-    #[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
-    async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
-        info!("starting upload {:?}", prepared);
-
-        let state_0 = self.state.clone();
-        let state_1 = {
-            let mut state = state_0.clone();
-            state.segments.push(prepared.clone());
-            state
-        };
-
-        // we're going to upload a new segment, let's write it to disk to make GC later
-        self.commit_state(state_1).await?;
-
-        self.upload_segment(prepared.clone()).await?;
-
-        let state_2 = {
-            let mut state = state_0.clone();
-            for seg in state.segments.iter_mut() {
-                seg.status = UploadStatus::Deleting;
-            }
-            let mut actual_remote_segment = prepared.clone();
-            actual_remote_segment.status = UploadStatus::Uploaded;
-            state.segments.push(actual_remote_segment);
-            state
-        };
-
-        // we've uploaded new segment, it's actual, all other segments should be GCed
-        self.commit_state(state_2).await?;
-        self.gc().await?;
-
-        Ok(())
-    }
-
-    /// Delete all non-Uploaded segments from the remote storage. There should be only one
-    /// Uploaded segment at a time.
-    #[instrument(name = "gc", skip_all)]
-    async fn gc(&mut self) -> anyhow::Result<()> {
-        let mut segments_to_delete = vec![];
-
-        let new_segments: Vec<PartialRemoteSegment> = self
-            .state
-            .segments
-            .iter()
-            .filter_map(|seg| {
-                if seg.status == UploadStatus::Uploaded {
-                    Some(seg.clone())
-                } else {
-                    segments_to_delete.push(seg.name.clone());
-                    None
-                }
-            })
-            .collect();
-
-        info!("deleting objects: {:?}", segments_to_delete);
-        let mut objects_to_delete = vec![];
-        for seg in segments_to_delete.iter() {
-            let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
-            objects_to_delete.push(remote_path);
-        }
-
-        // removing segments from remote storage
-        wal_backup::delete_objects(&objects_to_delete).await?;
-
-        // now we can update the state on disk
-        let new_state = {
-            let mut state = self.state.clone();
-            state.segments = new_segments;
-            state
-        };
-        self.commit_state(new_state).await?;
-
-        Ok(())
-    }
-}
-
-#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
-pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
-    debug!("started");
-    let await_duration = conf.partial_backup_timeout;
-
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
-    // sleep for random time to avoid thundering herd
-    {
-        let randf64 = rand::thread_rng().gen_range(0.0..1.0);
-        let sleep_duration = await_duration.mul_f64(randf64);
-        tokio::time::sleep(sleep_duration).await;
-    }
-
-    let (_, persistent_state) = tli.get_state().await;
-    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
-    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
-    let wal_seg_size = tli.get_wal_seg_size().await;
-
-    let local_prefix = tli.timeline_dir.clone();
-    let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
-        Ok(path) => path.to_owned(),
-        Err(e) => {
-            error!("failed to strip workspace dir prefix: {:?}", e);
-            return;
-        }
-    };
-
-    let mut backup = PartialBackup {
-        wal_seg_size,
-        tli,
-        state: persistent_state.partial_backup,
-        conf,
-        local_prefix,
-        remote_prefix,
-    };
-
-    debug!("state: {:?}", backup.state);
-
-    'outer: loop {
-        // wait until we have something to upload
-        let uploaded_segment = backup.state.uploaded_segment();
-        if let Some(seg) = &uploaded_segment {
-            // if we already uploaded something, wait until we have something new
-            while flush_lsn_rx.borrow().lsn == seg.flush_lsn
-                && *commit_lsn_rx.borrow() == seg.commit_lsn
-                && flush_lsn_rx.borrow().term == seg.term
-            {
-                tokio::select! {
-                    _ = cancellation_rx.changed() => {
-                        info!("timeline canceled");
-                        return;
-                    }
-                    _ = commit_lsn_rx.changed() => {}
-                    _ = flush_lsn_rx.changed() => {}
-                }
-            }
-        }
-
-        // fixing the segno and waiting some time to prevent reuploading the same segment too often
-        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
-        let timeout = tokio::time::sleep(await_duration);
-        tokio::pin!(timeout);
-        let mut timeout_expired = false;
-
-        // waiting until timeout expires OR segno changes
-        'inner: loop {
-            tokio::select! {
-                _ = cancellation_rx.changed() => {
-                    info!("timeline canceled");
-                    return;
-                }
-                _ = commit_lsn_rx.changed() => {}
-                _ = flush_lsn_rx.changed() => {
-                    let segno = backup.segno(flush_lsn_rx.borrow().lsn);
-                    if segno != pending_segno {
-                        // previous segment is no longer partial, aborting the wait
-                        break 'inner;
-                    }
-                }
-                _ = &mut timeout => {
-                    // timeout expired, now we are ready for upload
-                    timeout_expired = true;
-                    break 'inner;
-                }
-            }
-        }
-
-        if !timeout_expired {
-            // likely segno has changed, let's try again in the next iteration
-            continue 'outer;
-        }
-
-        let prepared = backup.prepare_upload().await;
-        if let Some(seg) = &uploaded_segment {
-            if seg.eq_without_status(&prepared) {
-                // we already uploaded this segment, nothing to do
-                continue 'outer;
-            }
-        }
-
-        match backup.do_upload(&prepared).await {
-            Ok(()) => {
-                debug!(
-                    "uploaded {} up to flush_lsn {}",
-                    prepared.name, prepared.flush_lsn
-                );
-                PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
-            }
-            Err(e) => {
-                info!("failed to upload {}: {:#}", prepared.name, e);
-                PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
-            }
-        }
-    }
-}
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -176,8 +176,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        http_auth: None,
        current_thread_runtime: false,
        walsenders_keep_horizon: false,
-        partial_backup_enabled: false,
-        partial_backup_timeout: Duration::from_secs(0),
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    Returns basepath for files with captured output.
    """
    assert isinstance(cmd, list)
-    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
+    base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
    basepath = os.path.join(capture_dir, base)
    stdout_filename = basepath + ".stdout"
    stderr_filename = basepath + ".stderr"

    with open(stdout_filename, "w") as stdout_f:
        with open(stderr_filename, "w") as stderr_f:
-            print(f'(capturing output to "{base}.stdout")')
+            print('(capturing output to "{}.stdout")'.format(base))
            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)

    return basepath
@@ -82,9 +82,11 @@ class PgBin:

    def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
        self.log_dir = log_dir
-        self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
+        self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin")
        self.env = os.environ.copy()
-        self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
+        self.env["LD_LIBRARY_PATH"] = os.path.join(
+            str(pg_distrib_dir), "v{}".format(pg_version), "lib"
+        )

    def _fixpath(self, command: List[str]):
        if "/" not in command[0]:
@@ -108,7 +110,7 @@ class PgBin:
        """

        self._fixpath(command)
-        print(f'Running command "{" ".join(command)}"')
+        print('Running command "{}"'.format(" ".join(command)))
        env = self._build_env(env)
        subprocess.run(command, env=env, cwd=cwd, check=True)

@@ -126,7 +128,7 @@ class PgBin:
        """

        self._fixpath(command)
-        print(f'Running command "{" ".join(command)}"')
+        print('Running command "{}"'.format(" ".join(command)))
        env = self._build_env(env)
        return subprocess_capture(
            str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
@@ -298,7 +300,7 @@ class NeonPageserverHttpClient(requests.Session):

 def lsn_to_hex(num: int) -> str:
    """Convert lsn from int to standard hex notation."""
-    return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
+    return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF)


 def lsn_from_hex(lsn_hex: str) -> int:
@@ -329,12 +331,16 @@ def wait_for_upload(
        if current_lsn >= lsn:
            return
        print(
-            f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
+            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
+                lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1
+            )
        )
        time.sleep(1)

    raise Exception(
-        f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
+        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
+            lsn_to_hex(lsn), lsn_to_hex(current_lsn)
+        )
    )


--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -482,18 +482,20 @@ def pytest_terminal_summary(
                terminalreporter.section("Benchmark results", "-")
                is_header_printed = True

-            terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ")
+            terminalreporter.write(
+                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
+            )
            unit = recorded_property["unit"]
            value = recorded_property["value"]
            if unit == "MB":
-                terminalreporter.write(f"{value:,.0f}", green=True)
+                terminalreporter.write("{0:,.0f}".format(value), green=True)
            elif unit in ("s", "ms") and isinstance(value, float):
-                terminalreporter.write(f"{value:,.3f}", green=True)
+                terminalreporter.write("{0:,.3f}".format(value), green=True)
            elif isinstance(value, float):
-                terminalreporter.write(f"{value:,.4f}", green=True)
+                terminalreporter.write("{0:,.4f}".format(value), green=True)
            else:
                terminalreporter.write(str(value), green=True)
-            terminalreporter.line(f" {unit}")
+            terminalreporter.line(" {}".format(unit))

            result_entry.append(recorded_property)

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Spray	4e5724d9c3	Merge pull request #7248 from neondatabase/rc/2024-03-26 Release 2024-03-26	2024-03-26 15:17:00 +00:00
John Spray	0d3e499059	Merge pull request #7219 from neondatabase/rc/2024-03-25 Release 2024-03-25	2024-03-25 12:28:09 +00:00
Arpad Müller	7b860b837c	Merge pull request #7154 from neondatabase/rc/2024-03-18 Release 2024-03-18	2024-03-19 12:07:14 +01:00
Christian Schwarz	41fc96e20f	fixup(#7160 / tokio_epoll_uring_ext): double-panic caused by info! in thread-local's drop() (#7164 ) Manual testing of the changes in #7160 revealed that, if the thread-local destructor ever runs (it apparently doesn't in our test suite runs, otherwise #7160 would not have auto-merged), we can encounter an `abort()` due to a double-panic in the tracing code. This github comment here contains the stack trace: https://github.com/neondatabase/neon/pull/7160#issuecomment-2003778176 This PR reverts #7160 and uses an atomic counter to identify the thread-local in log messages, instead of the memory address of the thread local, which may be re-used.	2024-03-18 16:28:17 +01:00
Christian Schwarz	fb2b1ce57b	fixup(#7141 / tokio_epoll_uring_ext): high frequency log message The PR #7141 added log message ``` ThreadLocalState is being dropped and id might be re-used in the future ``` which was supposed to be emitted when the thread-local is destroyed. Instead, it was emitted on _each_ call to `thread_local_system()`, i.e., on each tokio-epoll-uring operation.	2024-03-18 13:01:17 +01:00
Joonas Koivunen	464717451b	build: make procfs linux only dependency (#7156 ) the dependency refuses to build on macos so builds on `main` are broken right now, including the `release` PR.	2024-03-18 09:32:49 +00:00
Joonas Koivunen	c6ed86d3d0	Merge pull request #7081 from neondatabase/rc/2024-03-11 Release 2024-03-11	2024-03-11 14:41:39 +02:00
Roman Zaynetdinov	f0a9017008	Export db size, deadlocks and changed row metrics (#7050 ) ## Problem We want to report metrics for the oldest user database.	2024-03-11 11:55:06 +00:00
Christian Schwarz	bb7949ba00	Merge pull request #6993 from neondatabase/rc/2024-03-04 Release 2024-03-04	2024-03-04 13:08:44 +01:00
Arthur Petukhovsky	1df0f69664	Merge pull request #6973 from neondatabase/rc/2024-02-29-manual Release 2024-02-29	2024-02-29 17:26:33 +00:00
Vlad Lazar	970066a914	libs: fix expired token in auth decode test (#6963 ) The test token expired earlier today (1709200879). I regenerated the token, but without an expiration date this time.	2024-02-29 17:23:25 +00:00
Arthur Petukhovsky	1ebd3897c0	Merge pull request #6956 from neondatabase/rc/2024-02-28 Release 2024-02-28	2024-02-29 16:39:52 +00:00
Arthur Petukhovsky	6460beffcd	Merge pull request #6901 from neondatabase/rc/2024-02-26 Release 2024-02-26	2024-02-26 17:08:19 +00:00
John Spray	6f7f8958db	pageserver: only write out legacy tenant config if no generation (#6891 ) ## Problem Previously we always wrote out both legacy and modern tenant config files. The legacy write enabled rollbacks, but we are long past the point where that is needed. We still need the legacy format for situations where someone is running tenants without generations (that will be yanked as well eventually), but we can avoid writing it out at all if we do have a generation number set. We implicitly also avoid writing the legacy config if our mode is Secondary (secondary mode is newer than generations). ## Summary of changes - Make writing legacy tenant config conditional on there being no generation number set.	2024-02-26 10:25:25 +00:00
Christian Schwarz	936a00e077	pageserver: remove two obsolete/unused per-timeline metrics (#6893 ) over-compensating the addition of a new per-timeline metric in https://github.com/neondatabase/neon/pull/6834 part of https://github.com/neondatabase/neon/issues/6737	2024-02-26 09:16:24 +00:00
				`@@ -1,2 +0,0 @@`

				`ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';`