tweak comments

proxy: improve performance of leaky-bucket
2026-05-27 10:00:38 +00:00 · 2024-07-29 11:41:44 +01:00 · 2024-07-28 23:00:21 +01:00
54 changed files with 446 additions and 2012 deletions
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -14,8 +14,11 @@ inputs:
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
+  provisioner:
+    description: 'k8s-pod or k8s-neonvm'
+    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'

 outputs:
@@ -34,6 +37,10 @@ runs:
      # A shell without `set -x` to not to expose password/dsn in logs
      shell: bash -euo pipefail {0}
      run: |
+        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
+          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
+        fi
+
        project=$(curl \
          "https://${API_HOST}/api/v2/projects" \
          --fail \
@@ -45,7 +52,7 @@ runs:
              \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
              \"pg_version\": ${POSTGRES_VERSION},
              \"region_id\": \"${REGION_ID}\",
-              \"provisioner\": \"k8s-neonvm\",
+              \"provisioner\": \"${PROVISIONER}\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
              \"settings\": { }
@@ -68,5 +75,6 @@ runs:
        API_KEY: ${{ inputs.api_key }}
        REGION_ID: ${{ inputs.region_id }}
        POSTGRES_VERSION: ${{ inputs.postgres_version }}
+        PROVISIONER: ${{ inputs.provisioner }}
        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -63,9 +63,11 @@ jobs:
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod' 
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
+            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -98,6 +100,7 @@ jobs:
        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -213,11 +216,11 @@ jobs:
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
-    # - neonvm-captest-new: Freshly created project (1 CU)
-    # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neon-captest-new: Freshly created project (1 CU)
+    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
-    # - neonvm-captest-reuse: Reusing existing project
+    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
@@ -242,16 +245,18 @@ jobs:
            "'"$region_id_default"'"
            ],
          "platform": [
-            "neonvm-captest-new",
-            "neonvm-captest-reuse",
+            "neon-captest-new",
+            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

@@ -266,7 +271,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ]
        }'

@@ -282,7 +287,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ],
          "scale": [
            "10"
@@ -333,7 +338,7 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
@@ -341,18 +346,19 @@ jobs:
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -436,9 +442,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - PLATFORM: "neonvm-captest-pgvector"
+          - PLATFORM: "neon-captest-pgvector"
          - PLATFORM: "azure-captest-pgvector"
-
+            
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -480,7 +486,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-pgvector)
+          neon-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
          azure-captest-pgvector)
@@ -579,7 +585,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
            ;;
          rds-aurora)
@@ -589,7 +595,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -666,7 +672,7 @@ jobs:
    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
@@ -676,7 +682,7 @@ jobs:
            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -753,7 +759,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
            ;;
          rds-aurora)
@@ -763,7 +769,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -833,9 +833,6 @@ jobs:
          rm -rf .docker-custom

  promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -862,28 +859,6 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -13,7 +13,6 @@ on:
    paths:
      - '.github/workflows/pg-clients.yml'
      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
      - 'poetry.lock'
  workflow_dispatch:

@@ -50,77 +49,6 @@ jobs:
      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

-  test-logical-replication:
-    needs: [ build-build-tools-image ]
-    runs-on: ubuntu-22.04
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init --user root
-    services:
-      clickhouse:
-        image: clickhouse/clickhouse-server:24.6.3.64
-        ports:
-          - 9000:9000
-          - 8123:8123
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download Neon artifact
-        uses: ./.github/actions/download
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-          path: /tmp/neon/
-          prefix: latest
-
-      - name: Create Neon Project
-        id: create-neon-project
-        uses: ./.github/actions/neon-project-create
-        with:
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
-
-      - name: Run tests
-        uses: ./.github/actions/run-python-test-set
-        with:
-          build_type: remote
-          test_selection: logical_repl
-          run_in_parallel: false
-          extra_params: -m remote_cluster
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        env:
-          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
-
-      - name: Delete Neon Project
-        if: always()
-        uses: ./.github/actions/neon-project-delete
-        with:
-          project_id: ${{ steps.create-neon-project.outputs.project_id }}
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-      - name: Create Allure report
-        if: ${{ !cancelled() }}
-        id: create-allure-report
-        uses: ./.github/actions/allure-report-generate
-        with:
-          store-test-results-into-db: true
-        env:
-          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-
-      - name: Post to a Slack channel
-        if: github.event.schedule && failure()
-        uses: slackapi/slack-github-action@v1
-        with:
-          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
-          slack-message: |
-            Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  test-postgres-client-libs:
    needs: [ build-build-tools-image ]
    runs-on: ubuntu-22.04
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
- "chrono",
 "diesel_derives",
 "itoa",
 "pq-sys",
@@ -1744,18 +1743,6 @@ dependencies = [
 "const-random",
 ]

-[[package]]
-name = "dns-lookup"
-version = "2.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
-dependencies = [
- "cfg-if",
- "libc",
- "socket2 0.5.5",
- "windows-sys 0.48.0",
-]
-
 [[package]]
 name = "dsl_auto_type"
 version = "0.1.1"
@@ -5731,12 +5718,10 @@ dependencies = [
 "aws-config",
 "bytes",
 "camino",
- "chrono",
 "clap",
 "control_plane",
 "diesel",
 "diesel_migrations",
- "dns-lookup",
 "fail",
 "futures",
 "git-version",
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -514,6 +514,7 @@ impl LocalEnv {
                #[derive(serde::Serialize, serde::Deserialize)]
                // (allow unknown fields, unlike PageServerConf)
                struct PageserverConfigTomlSubset {
+                    id: NodeId,
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
                        .with_context(|| format!("read {:?}", config_toml_path))?,
                )
                .context("parse pageserver.toml")?;
-                let identity_toml_path = dentry.path().join("identity.toml");
-                #[derive(serde::Serialize, serde::Deserialize)]
-                struct IdentityTomlSubset {
-                    id: NodeId,
-                }
-                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
-                    &std::fs::read_to_string(&identity_toml_path)
-                        .with_context(|| format!("read {:?}", identity_toml_path))?,
-                )
-                .context("parse identity.toml")?;
                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                } = config_toml;
-                let IdentityTomlSubset {
-                    id: identity_toml_id,
-                } = identity_toml;
                let conf = PageServerConf {
                    id: {
                        anyhow::ensure!(
-                            identity_toml_id == id,
-                            "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
                        );
                        id
                    },
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -127,13 +127,10 @@ impl PageServerNode {
        }

        // Apply the user-provided overrides
-        overrides.push({
-            let mut doc =
-                toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
-            // `id` is written out to `identity.toml` instead of `pageserver.toml`
-            doc.remove("id").expect("it's part of the struct");
-            doc.to_string()
-        });
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::{Duration, Instant};
+use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

-/// Metadata health record posted from scrubber.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthRecord {
-    pub tenant_shard_id: TenantShardId,
-    pub healthy: bool,
-    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateRequest {
-    pub healthy_tenant_shards: HashSet<TenantShardId>,
-    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateResponse {}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListUnhealthyResponse {
-    pub unhealthy_tenant_shards: Vec<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedRequest {
-    #[serde(with = "humantime_serde")]
-    pub not_scrubbed_for: Duration,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedResponse {
-    pub health_records: Vec<MetadataHealthRecord>,
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
                    .blobs()
                    .map(|k| ListingObject{
                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
+                        last_modified: k.properties.last_modified.into()
                    }
                    );

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -153,7 +153,6 @@ pub enum ListingMode {
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
-    pub size: u64,
 }

 #[derive(Default)]
@@ -195,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>>;

    async fn list(
        &self,
@@ -352,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
                            key: k.clone(),
                            // LocalFs is just for testing, so just specify a dummy time
                            last_modified: SystemTime::now(),
-                            size: 0,
                        })
                    }
                })
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
                            key: RemotePath::from_string(&relative_key).unwrap(),
                            // LocalFs is just for testing
                            last_modified: SystemTime::now(),
-                            size: 0,
                        });
                    }
                }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
                        }
                    };

-                    let size = object.size.unwrap_or(0) as u64;
-
                    result.keys.push(ListingObject{
                        key,
-                        last_modified,
-                        size,
+                        last_modified
                    });
                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
    // TODO: join these two?
    Tenant,
-    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    /// Should only be used e.g. for status check/tenant creation/list.
+    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    // Should only be used e.g. for status check/tenant creation/list.
    PageServerApi,
-    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    /// Should only be used e.g. for status check.
-    /// Currently also used for connection from any pageserver to any safekeeper.
+    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    // Should only be used e.g. for status check.
+    // Currently also used for connection from any pageserver to any safekeeper.
    SafekeeperData,
-    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
-    /// Allows access to control plane managment API and some storage controller endpoints.
+    // Allows access to control plane managment API and some storage controller endpoints.
    Admin,

    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -356,6 +356,8 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

+    id: BuilderValue<NodeId>,
+
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -404,8 +406,11 @@ struct PageServerConfigBuilder {
 }

 impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
+    fn new(node_id: NodeId) -> Self {
+        let mut this = Self::default();
+        this.id(node_id);
+
+        this
    }

    #[inline(always)]
@@ -433,6 +438,7 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
+            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),
@@ -562,6 +568,10 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

+    pub fn id(&mut self, node_id: NodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }
@@ -673,7 +683,7 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
+    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -706,6 +716,7 @@ impl PageServerConfigBuilder {
                pg_auth_type,
                auth_validation_public_key_path,
                remote_storage_config,
+                id,
                broker_endpoint,
                broker_keepalive_interval,
                log_format,
@@ -733,7 +744,6 @@ impl PageServerConfigBuilder {
            }
            CUSTOM LOGIC
            {
-                id: id,
                // TenantConf is handled separately
                default_tenant_conf: TenantConf::default(),
                concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -883,7 +893,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
+        let mut builder = PageServerConfigBuilder::new(node_id);
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -914,6 +924,8 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
+                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
+                            // Logging is not set up yet, so we can't do it.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1006,7 +1018,7 @@ impl PageServerConf {
            }
        }

-        let mut conf = builder.build(node_id).context("invalid config")?;
+        let mut conf = builder.build().context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf
@@ -1243,6 +1255,7 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1259,8 +1272,9 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
+        let config_string = format!(
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
+        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1565,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80
@@ -1634,6 +1649,7 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2129,24 +2129,14 @@ async fn secondary_download_handler(

    let timeout = wait.unwrap_or(Duration::MAX);

-    let result = tokio::time::timeout(
+    let status = match tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await;
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    let status = match result {
-        Ok(Ok(())) => {
-            if progress.layers_downloaded >= progress.layers_total {
-                // Download job ran to completion
-                StatusCode::OK
-            } else {
-                // Download dropped out without errors because it ran out of time budget
-                StatusCode::ACCEPTED
-            }
-        }
+    .await
+    {
+        // Download job ran to completion.
+        Ok(Ok(())) => StatusCode::OK,
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
@@ -2156,6 +2146,8 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
    json_response(status, progress)
 }

--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -2,23 +2,13 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    #[default]
    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
-}
-
-impl Default for L0FlushConfig {
-    fn default() -> Self {
-        Self::Direct {
-            // TODO: using num_cpus results in different peak memory usage on different instance types.
-            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
-        }
-    }
+    Direct { max_concurrency: NonZeroUsize },
 }

 #[derive(Clone)]
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -613,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_considered",
-        "Size of potentially compressible data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_chosen",
-        "Size of data whose compressed form was written into image layers"
+        "Size of uncompressed data written into image layers"
    )
    .expect("failed to define a metric")
 });
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-#[derive(Copy, Clone, Debug)]
-pub struct CompressionInfo {
-    pub written_compressed: bool,
-    pub compressed_size: Option<usize>,
-}
-
 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        let (buf, res) = self
-            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await;
-        (buf, res.map(|(off, _compression_info)| off))
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
+    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;
-        let mut compression_info = CompressionInfo {
-            written_compressed: false,
-            compressed_size: None,
-        };

        let len = srcbuf.bytes_init();

@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        encoder.write_all(&slice[..]).await.unwrap();
                        encoder.shutdown().await.unwrap();
                        let compressed = encoder.into_inner();
-                        compression_info.compressed_size = Some(compressed.len());
                        if compressed.len() < len {
-                            compression_info.written_compressed = true;
                            let compressed_len = compressed.len();
                            compressed_buf = Some(compressed);
                            (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        } else {
            self.write_all(srcbuf, ctx).await
        };
-        (srcbuf, res.map(|_| (offset, compression_info)))
+        (srcbuf, res.map(|_| offset))
    }
 }

@@ -430,14 +416,12 @@ pub(crate) mod tests {
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
-                    let res = wtr
-                        .write_blob_maybe_compressed(
-                            blob.clone(),
-                            ctx,
-                            ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                        )
-                        .await;
-                    (res.0, res.1.map(|(off, _)| off))
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
                } else {
                    wtr.write_blob(blob.clone(), ctx).await
                };
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1384,32 +1384,34 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
    ) -> Result<(), DeleteTenantError> {
        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let mut keys_stream = self.resources.remote_storage.list_streaming(
-            Some(&remote_path),
-            remote_storage::ListingMode::NoDelimiter,
-            None,
-            &self.cancel,
-        );
-        while let Some(chunk) = keys_stream.next().await {
-            let keys = match chunk {
-                Ok(listing) => listing.keys,
-                Err(remote_storage::DownloadError::Cancelled) => {
-                    return Err(DeleteTenantError::Cancelled)
-                }
-                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-            };
-
-            if keys.is_empty() {
-                tracing::info!("Remote storage already deleted");
-            } else {
-                tracing::info!("Deleting {} keys from remote storage", keys.len());
-                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-                self.resources
-                    .remote_storage
-                    .delete_objects(&keys, &self.cancel)
-                    .await?;
+        let keys = match self
+            .resources
+            .remote_storage
+            .list(
+                Some(&remote_path),
+                remote_storage::ListingMode::NoDelimiter,
+                None,
+                &self.cancel,
+            )
+            .await
+        {
+            Ok(listing) => listing.keys,
+            Err(remote_storage::DownloadError::Cancelled) => {
+                return Err(DeleteTenantError::Cancelled)
            }
+            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+        };
+
+        if keys.is_empty() {
+            tracing::info!("Remote storage already deleted");
+        } else {
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+            self.resources
+                .remote_storage
+                .delete_objects(&keys, &self.cancel)
+                .await?;
        }

        Ok(())
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
            .write_blob_maybe_compressed(val, ctx, compression)
            .await;
        let off = match res {
-            Ok((off, _)) => off,
+            Ok(off) => off,
            Err(e) => return (val, Err(anyhow::anyhow!(e))),
        };

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -734,14 +734,6 @@ struct ImageLayerWriterInner {
    // Total uncompressed bytes passed into put_image
    uncompressed_bytes: u64,

-    // Like `uncompressed_bytes`,
-    // but only of images we might consider for compression
-    uncompressed_bytes_eligible: u64,
-
-    // Like `uncompressed_bytes`, but only of images
-    // where we have chosen their compressed form
-    uncompressed_bytes_chosen: u64,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }
@@ -798,8 +790,6 @@ impl ImageLayerWriterInner {
            tree: tree_builder,
            blob_writer,
            uncompressed_bytes: 0,
-            uncompressed_bytes_eligible: 0,
-            uncompressed_bytes_chosen: 0,
        };

        Ok(writer)
@@ -818,22 +808,13 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-        let uncompressed_len = img.len() as u64;
-        self.uncompressed_bytes += uncompressed_len;
+        self.uncompressed_bytes += img.len() as u64;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-        let (off, compression_info) = res?;
-        if compression_info.compressed_size.is_some() {
-            // The image has been considered for compression at least
-            self.uncompressed_bytes_eligible += uncompressed_len;
-        }
-        if compression_info.written_compressed {
-            // The image has been compressed
-            self.uncompressed_bytes_chosen += uncompressed_len;
-        }
+        let off = res?;

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
@@ -856,9 +837,6 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
-            .inc_by(self.uncompressed_bytes_eligible);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -58,7 +58,7 @@ use std::{
    sync::atomic::AtomicU64,
 };
 use std::{
-    cmp::{max, min},
+    cmp::{max, min, Ordering},
    ops::ControlFlow,
 };
 use std::{
@@ -177,6 +177,25 @@ impl std::fmt::Display for ImageLayerCreationMode {
    }
 }

+/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct Hole {
+    key_range: Range<Key>,
+    coverage_size: usize,
+}
+
+impl Ord for Hole {
+    fn cmp(&self, other: &Self) -> Ordering {
+        other.coverage_size.cmp(&self.coverage_size) // inverse order
+    }
+}
+
+impl PartialOrd for Hole {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
 fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
-use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -608,93 +608,62 @@ impl Timeline {
            .read_lock_held_spawn_blocking_startup_micros
            .till_now();

-        // TODO: replace with streaming k-merge
-        let all_keys = {
-            let mut all_keys = Vec::new();
-            for l in deltas_to_compact.iter() {
-                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
-            }
-            // The current stdlib sorting implementation is designed in a way where it is
-            // particularly fast where the slice is made up of sorted sub-ranges.
-            all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
-            all_keys
-        };
+        // Determine N largest holes where N is number of compacted layers.
+        let max_holes = deltas_to_compact.len();
+        let last_record_lsn = self.get_last_record_lsn();
+        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+        let min_hole_coverage_size = 3; // TODO: something more flexible?
+
+        // min-heap (reserve space for one more element added before eviction)
+        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+        let mut prev: Option<Key> = None;
+
+        let mut all_keys = Vec::new();
+
+        for l in deltas_to_compact.iter() {
+            all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+        }
+
+        // FIXME: should spawn_blocking the rest of this function
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));

        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();

-        // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
-        //
-        // A hole is a key range for which this compaction doesn't have any WAL records.
-        // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
-        // cover the hole, but actually don't contain any WAL records for that key range.
-        // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
-        // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
-        //
-        // The algorithm chooses holes as follows.
-        // - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys).
-        // - Filter: min threshold on range length
-        // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
-        //
-        // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
-        #[derive(PartialEq, Eq)]
-        struct Hole {
-            key_range: Range<Key>,
-            coverage_size: usize,
-        }
-        let holes: Vec<Hole> = {
-            use std::cmp::Ordering;
-            impl Ord for Hole {
-                fn cmp(&self, other: &Self) -> Ordering {
-                    self.coverage_size.cmp(&other.coverage_size).reverse()
-                }
-            }
-            impl PartialOrd for Hole {
-                fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-                    Some(self.cmp(other))
-                }
-            }
-            let max_holes = deltas_to_compact.len();
-            let last_record_lsn = self.get_last_record_lsn();
-            let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
-            let min_hole_coverage_size = 3; // TODO: something more flexible?
-                                            // min-heap (reserve space for one more element added before eviction)
-            let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
-            let mut prev: Option<Key> = None;
-
-            for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
-                if let Some(prev_key) = prev {
-                    // just first fast filter, do not create hole entries for metadata keys. The last hole in the
-                    // compaction is the gap between data key and metadata keys.
-                    if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
-                        && !Key::is_metadata_key(&prev_key)
-                    {
-                        let key_range = prev_key..next_key;
-                        // Measuring hole by just subtraction of i128 representation of key range boundaries
-                        // has not so much sense, because largest holes will corresponds field1/field2 changes.
-                        // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
-                        // That is why it is better to measure size of hole as number of covering image layers.
-                        let coverage_size =
-                            layers.image_coverage(&key_range, last_record_lsn).len();
-                        if coverage_size >= min_hole_coverage_size {
-                            heap.push(Hole {
-                                key_range,
-                                coverage_size,
-                            });
-                            if heap.len() > max_holes {
-                                heap.pop(); // remove smallest hole
-                            }
+        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
+            if let Some(prev_key) = prev {
+                // just first fast filter, do not create hole entries for metadata keys. The last hole in the
+                // compaction is the gap between data key and metadata keys.
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                    && !Key::is_metadata_key(&prev_key)
+                {
+                    let key_range = prev_key..next_key;
+                    // Measuring hole by just subtraction of i128 representation of key range boundaries
+                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
+                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
+                    // That is why it is better to measure size of hole as number of covering image layers.
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
+                    if coverage_size >= min_hole_coverage_size {
+                        heap.push(Hole {
+                            key_range,
+                            coverage_size,
+                        });
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
                        }
                    }
                }
-                prev = Some(next_key.next());
            }
-            let mut holes = heap.into_vec();
-            holes.sort_unstable_by_key(|hole| hole.key_range.start);
-            holes
-        };
+            prev = Some(next_key.next());
+        }
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
+        let mut holes = heap.into_vec();
+        holes.sort_unstable_by_key(|hole| hole.key_range.start);
+        let mut next_hole = 0; // index of next hole in holes vector

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
@@ -769,7 +738,6 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        let mut next_hole = 0; // index of next hole in holes vector

        for &DeltaEntry {
            key, lsn, ref val, ..
--- a/poetry.lock
+++ b/poetry.lock
@@ -870,96 +870,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}

-[[package]]
-name = "clickhouse-connect"
-version = "0.7.17"
-description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
-optional = false
-python-versions = "~=3.8"
-files = [
-    {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
-]
-
-[package.dependencies]
-certifi = "*"
-lz4 = "*"
-pytz = "*"
-urllib3 = ">=1.26"
-zstandard = "*"
-
-[package.extras]
-arrow = ["pyarrow"]
-numpy = ["numpy"]
-orjson = ["orjson"]
-pandas = ["pandas"]
-sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
-tzlocal = ["tzlocal (>=4.0)"]
-
 [[package]]
 name = "colorama"
 version = "0.4.5"
@@ -1560,56 +1470,6 @@ files = [
    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
 ]

-[[package]]
-name = "lz4"
-version = "4.3.3"
-description = "LZ4 Bindings for Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
-    {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
-    {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
-    {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
-    {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
-    {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
-    {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
-    {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
-    {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
-    {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
-    {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
-    {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
-    {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
-]
-
-[package.extras]
-docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
-flake8 = ["flake8"]
-tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
-
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -2501,17 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"

-[[package]]
-name = "pytz"
-version = "2024.1"
-description = "World timezone definitions, modern and historical"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
-    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3357,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -5,6 +5,4 @@ pub use limit_algorithm::{
 };
 pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
 mod leaky_bucket;
-pub use leaky_bucket::{
-    EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
-};
+pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -1,6 +1,7 @@
 use std::{
    hash::Hash,
    sync::atomic::{AtomicUsize, Ordering},
+    time::Duration,
 };

 use ahash::RandomState;
@@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;

 pub struct LeakyBucketRateLimiter<Key> {
    map: DashMap<Key, LeakyBucketState, RandomState>,
-    config: LeakyBucketConfig,
+    config: LeakyBucketConfigInner,
    access_count: AtomicUsize,
 }

@@ -29,7 +30,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
    pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
        Self {
            map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
-            config,
+            config: config.into(),
            access_count: AtomicUsize::new(0),
        }
    }
@@ -42,10 +43,10 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
            self.do_gc(now);
        }

-        let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
-            time: now,
-            filled: 0.0,
-        });
+        let mut entry = self
+            .map
+            .entry(key)
+            .or_insert_with(|| LeakyBucketState::new(now));

        entry.check(&self.config, now, n as f64)
    }
@@ -59,7 +60,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
        let shard = thread_rng().gen_range(0..n);
        self.map.shards()[shard]
            .write()
-            .retain(|_, value| !value.get_mut().update(&self.config, now));
+            .retain(|_, value| value.get().should_retain(now));
    }
 }

@@ -68,11 +69,6 @@ pub struct LeakyBucketConfig {
    pub max: f64,
 }

-pub struct LeakyBucketState {
-    filled: f64,
-    time: Instant,
-}
-
 impl LeakyBucketConfig {
    pub fn new(rps: f64, max: f64) -> Self {
        assert!(rps > 0.0, "rps must be positive");
@@ -81,40 +77,76 @@ impl LeakyBucketConfig {
    }
 }

-impl LeakyBucketState {
-    pub fn new() -> Self {
+struct LeakyBucketConfigInner {
+    /// "time cost" of a single request unit.
+    /// loosely represents how long it takes to handle a request unit in active CPU time.
+    time_cost: Duration,
+    bucket_width: Duration,
+}
+
+impl From<LeakyBucketConfig> for LeakyBucketConfigInner {
+    fn from(config: LeakyBucketConfig) -> Self {
+        // seconds_per_request = 1/(request_per_second)
+        let spr = config.rps.recip();
        Self {
-            filled: 0.0,
-            time: Instant::now(),
+            time_cost: Duration::from_secs_f64(spr),
+            bucket_width: Duration::from_secs_f64(config.max * spr),
        }
    }
-
-    /// updates the timer and returns true if the bucket is empty
-    fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
-        let drain = now.duration_since(self.time);
-        let drain = drain.as_secs_f64() * info.rps;
-
-        self.filled = (self.filled - drain).clamp(0.0, info.max);
-        self.time = now;
-
-        self.filled == 0.0
-    }
-
-    pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
-        self.update(info, now);
-
-        if self.filled + n > info.max {
-            return false;
-        }
-        self.filled += n;
-
-        true
-    }
 }

-impl Default for LeakyBucketState {
-    fn default() -> Self {
-        Self::new()
+struct LeakyBucketState {
+    /// Bucket is represented by `start..end` where `start = end - config.bucket_width`.
+    ///
+    /// At any given time, `end - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
+    /// Adding `n` tokens to the bucket is done by moving `end` forward by `n * config.time_cost`.
+    /// If `now < start`, the bucket is considered filled and cannot accept any more tokens.
+    /// Draining the bucket will happen naturally as `now` moves forward.
+    ///
+    /// Let `n` be some "time cost" for the request,
+    /// If now is after end, the bucket is empty and the end is reset to now,
+    /// If now is within the `bucket window + n`, we are within time budget.
+    /// If now is before the `bucket window + n`, we have run out of budget.
+    ///
+    /// This is inspired by the generic cell rate algorithm (GCRA) and works
+    /// exactly the same as a leaky-bucket.
+    end: Instant,
+}
+
+impl LeakyBucketState {
+    fn new(now: Instant) -> Self {
+        Self { end: now }
+    }
+
+    fn should_retain(&self, now: Instant) -> bool {
+        // if self.end is after now, the bucket is not empty
+        now < self.end
+    }
+
+    fn check(&mut self, config: &LeakyBucketConfigInner, now: Instant, n: f64) -> bool {
+        let start = self.end - config.bucket_width;
+
+        let n = config.time_cost.mul_f64(n);
+
+        //       start          end
+        //       |     start+n  |     end+n
+        //       |   /          |   /
+        // ------{o-[---------o-}--]----o----
+        //   now1 ^      now2 ^         ^ now3
+        //
+        // at now1, the bucket would be completely filled if we add n tokens.
+        // at now2, the bucket would be partially filled if we add n tokens.
+        // at now3, the bucket would start completely empty before we add n tokens.
+
+        if self.end + n <= now {
+            self.end = now + n;
+            true
+        } else if start + n <= now {
+            self.end += n;
+            true
+        } else {
+            false
+        }
    }
 }

@@ -124,47 +156,50 @@ mod tests {

    use tokio::time::Instant;

-    use super::{LeakyBucketConfig, LeakyBucketState};
+    use super::{LeakyBucketConfig, LeakyBucketConfigInner, LeakyBucketState};

    #[tokio::test(start_paused = true)]
    async fn check() {
-        let info = LeakyBucketConfig::new(500.0, 2000.0);
-        let mut bucket = LeakyBucketState::new();
+        let config: LeakyBucketConfigInner = LeakyBucketConfig::new(500.0, 2000.0).into();
+        assert_eq!(config.time_cost, Duration::from_millis(2));
+        assert_eq!(config.bucket_width, Duration::from_secs(4));
+
+        let mut bucket = LeakyBucketState::new(Instant::now());

        // should work for 2000 requests this second
        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-        assert_eq!(bucket.filled, 2000.0);
+        assert!(!bucket.check(&config, Instant::now(), 1.0));
+        assert_eq!(bucket.end - Instant::now(), config.bucket_width);

        // in 1ms we should drain 0.5 tokens.
        // make sure we don't lose any tokens
        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));
        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(bucket.check(&info, Instant::now(), 1.0));
+        assert!(bucket.check(&config, Instant::now(), 1.0));

        // in 10ms we should drain 5 tokens
        tokio::time::advance(Duration::from_millis(10)).await;
        for _ in 0..5 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));

        // in 10s we should drain 5000 tokens
        // but cap is only 2000
        tokio::time::advance(Duration::from_secs(10)).await;
        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));

        // should sustain 500rps
        for _ in 0..2000 {
            tokio::time::advance(Duration::from_millis(10)).await;
            for _ in 0..5 {
-                assert!(bucket.check(&info, Instant::now(), 1.0));
+                assert!(bucket.check(&config, Instant::now(), 1.0));
            }
        }
    }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,6 @@ zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
-clickhouse-connect = "^0.7.16"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -18,7 +18,6 @@ anyhow.workspace = true
 aws-config.workspace = true
 bytes.workspace = true
 camino.workspace = true
-chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
@@ -45,17 +44,12 @@ scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true

-diesel = { version = "2.1.4", features = [
-    "serde_json",
-    "postgres",
-    "r2d2",
-    "chrono",
-] }
+diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
-dns-lookup = { version = "2.0.4" }

 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
@@ -1 +0,0 @@
-DROP TABLE metadata_health;
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
@@ -1,14 +0,0 @@
-CREATE TABLE metadata_health (
-  tenant_id VARCHAR NOT NULL,
-  shard_number INTEGER NOT NULL,
-  shard_count INTEGER NOT NULL,
-  PRIMARY KEY(tenant_id, shard_number, shard_count),
-  -- Rely on cascade behavior for delete
-  FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
-  healthy BOOLEAN NOT NULL DEFAULT TRUE,
-  last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
-);
-
-
-INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
-SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
@@ -1 +0,0 @@
-DROP TABLE leader;
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
@@ -1,6 +0,0 @@
-CREATE TABLE leader (
-  hostname VARCHAR NOT NULL,
-  port INTEGER NOT NULL,
-  started_at TIMESTAMPTZ NOT NULL,
-  PRIMARY KEY(hostname, port, started_at)
-);
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -10,11 +10,7 @@ use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
-use pageserver_api::controller_api::{
-    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
-    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    TenantCreateRequest,
-};
+use pageserver_api::controller_api::TenantCreateRequest;
 use pageserver_api::models::{
    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
@@ -564,51 +560,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Scrubber)?;
-
-    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    state.service.metadata_health_update(update_req).await?;
-
-    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
-}
-
-async fn handle_metadata_health_list_unhealthy(
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListUnhealthyResponse {
-            unhealthy_tenant_shards,
-        },
-    )
-}
-
-async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
-    let state = get_state(&req);
-    let health_records = state
-        .service
-        .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
-        .await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListOutdatedResponse { health_records },
-    )
-}
-
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -1036,28 +987,6 @@ pub fn make_router(
                RequestName("control_v1_cancel_node_fill"),
            )
        })
-        // Metadata health operations
-        .post("/control/v1/metadata_health/update", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_update,
-                RequestName("control_v1_metadata_health_update"),
-            )
-        })
-        .get("/control/v1/metadata_health/unhealthy", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_unhealthy,
-                RequestName("control_v1_metadata_health_list_unhealthy"),
-            )
-        })
-        .post("/control/v1/metadata_health/outdated", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_outdated,
-                RequestName("control_v1_metadata_health_list_outdated"),
-            )
-        })
        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -10,7 +10,6 @@ mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
-mod peer_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -81,9 +81,6 @@ struct Cli {
    #[arg(long, default_value = "5s")]
    db_connect_timeout: humantime::Duration,

-    #[arg(long, default_value = "false")]
-    start_as_candidate: bool,
-
    /// `neon_local` sets this to the path of the neon_local repo dir.
    /// Only relevant for testing.
    // TODO: make `cfg(feature = "testing")`
@@ -276,8 +273,6 @@ async fn async_main() -> anyhow::Result<()> {
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
-        start_as_candidate: args.start_as_candidate,
-        http_service_port: args.listen.port() as i32,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -1,104 +0,0 @@
-use crate::tenant_shard::ObservedState;
-use pageserver_api::shard::TenantShardId;
-use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-use tokio_util::sync::CancellationToken;
-
-use reqwest::{StatusCode, Url};
-use utils::{backoff, http::error::HttpErrorBody};
-
-#[derive(Debug, Clone)]
-pub(crate) struct PeerClient {
-    hostname: String,
-    port: i32,
-    jwt: Option<String>,
-    client: reqwest::Client,
-}
-
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum StorageControllerPeerError {
-    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
-    DeserializationError(StatusCode, Url, reqwest::Error),
-    #[error("storage controller peer API error ({0}): {1}")]
-    ApiError(StatusCode, String),
-    #[error("failed to send HTTP request: {0}")]
-    SendError(reqwest::Error),
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
-
-pub(crate) trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
-}
-
-impl ResponseErrorMessageExt for reqwest::Response {
-    async fn error_from_body(self) -> Result<Self> {
-        let status = self.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            return Ok(self);
-        }
-
-        let url = self.url().to_owned();
-        Err(match self.json::<HttpErrorBody>().await {
-            Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
-            Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
-        })
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
-
-impl PeerClient {
-    pub(crate) fn new(hostname: String, port: i32, jwt: Option<String>) -> Self {
-        Self {
-            hostname,
-            port,
-            jwt,
-            client: reqwest::Client::new(),
-        }
-    }
-
-    async fn request_step_down(&self) -> Result<GlobalObservedState> {
-        let uri = format!("{}:{}/control/v1/step_down", self.hostname, self.port);
-        let req = self.client.put(uri);
-        let req = if let Some(jwt) = &self.jwt {
-            req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
-        } else {
-            req
-        };
-
-        let res = req
-            .send()
-            .await
-            .map_err(StorageControllerPeerError::SendError)?;
-        let response = res.error_from_body().await?;
-
-        let status = response.status();
-        let url = response.url().to_owned();
-
-        response
-            .json()
-            .await
-            .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
-    }
-
-    pub(crate) async fn step_down(
-        &self,
-        cancel: &CancellationToken,
-    ) -> Result<GlobalObservedState> {
-        backoff::retry(
-            || self.request_step_down(),
-            |_e| false,
-            4,
-            8,
-            "Send step down request",
-            cancel,
-        )
-        .await
-        .ok_or_else(|| StorageControllerPeerError::Cancelled)
-        .and_then(|x| x)
-    }
-}
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -8,7 +8,6 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
-use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
@@ -91,12 +90,6 @@ pub(crate) enum DatabaseOperation {
    UpdateTenantShard,
    DeleteTenant,
    UpdateTenantConfig,
-    UpdateMetadataHealth,
-    ListMetadataHealth,
-    ListMetadataHealthUnhealthy,
-    ListMetadataHealthOutdated,
-    GetLeader,
-    UpdateLeader,
 }

 #[must_use]
@@ -314,32 +307,15 @@ impl Persistence {
        &self,
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health;
-        use crate::schema::tenant_shards;
-
-        let now = chrono::Utc::now();
-
-        let metadata_health_records = shards
-            .iter()
-            .map(|t| MetadataHealthPersistence {
-                tenant_id: t.tenant_id.clone(),
-                shard_number: t.shard_number,
-                shard_count: t.shard_count,
-                healthy: true,
-                last_scrubbed_at: now,
-            })
-            .collect::<Vec<_>>();
-
+        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(
            DatabaseOperation::InsertTenantShards,
            move |conn| -> DatabaseResult<()> {
-                diesel::insert_into(tenant_shards::table)
-                    .values(&shards)
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health::table)
-                    .values(&metadata_health_records)
-                    .execute(conn)?;
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                Ok(())
            },
        )
@@ -353,10 +329,10 @@ impl Persistence {
        self.with_measured_conn(
            DatabaseOperation::DeleteTenant,
            move |conn| -> DatabaseResult<()> {
-                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(del_tenant_id.to_string()))
                    .execute(conn)?;
+
                Ok(())
            },
        )
@@ -699,159 +675,6 @@ impl Persistence {
        )
        .await
    }
-
-    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
-    ///
-    /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
-    #[allow(dead_code)]
-    pub(crate) async fn update_metadata_health_records(
-        &self,
-        healthy_records: Vec<MetadataHealthPersistence>,
-        unhealthy_records: Vec<MetadataHealthPersistence>,
-        now: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::UpdateMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                diesel::insert_into(metadata_health)
-                    .values(&healthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health)
-                    .values(&unhealthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-                Ok(())
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records.
-    #[allow(dead_code)]
-    pub(crate) async fn list_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                Ok(
-                    crate::schema::metadata_health::table
-                        .load::<MetadataHealthPersistence>(conn)?,
-                )
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that is unhealthy.
-    #[allow(dead_code)]
-    pub(crate) async fn list_unhealthy_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthUnhealthy,
-            move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::metadata_health::table
-                    .filter(healthy.eq(false))
-                    .load::<MetadataHealthPersistence>(conn)?)
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that have not been updated since an `earlier` time.
-    #[allow(dead_code)]
-    pub(crate) async fn list_outdated_metadata_health_records(
-        &self,
-        earlier: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthOutdated,
-            move |conn| -> DatabaseResult<_> {
-                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
-                let res = query.load::<MetadataHealthPersistence>(conn)?;
-
-                Ok(res)
-            },
-        )
-        .await
-    }
-
-    /// Get the current entry from the `leader` table if one exists.
-    /// It is an error for the table to contain more than one entry.
-    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<LeaderPersistence>> {
-        let mut leader: Vec<LeaderPersistence> = self
-            .with_measured_conn(
-                DatabaseOperation::GetLeader,
-                move |conn| -> DatabaseResult<_> {
-                    Ok(crate::schema::leader::table.load::<LeaderPersistence>(conn)?)
-                },
-            )
-            .await?;
-
-        if leader.len() > 1 {
-            return Err(DatabaseError::Logical(format!(
-                "More than one entry present in the leader table: {leader:?}"
-            )));
-        }
-
-        Ok(leader.pop())
-    }
-
-    /// Update the new leader with compare-exchange semantics. If `prev` does not
-    /// match the current leader entry, then the update is treated as a failure.
-    /// When `prev` is not specified, the update is forced.
-    pub(crate) async fn update_leader(
-        &self,
-        prev: Option<LeaderPersistence>,
-        new: LeaderPersistence,
-    ) -> DatabaseResult<()> {
-        use crate::schema::leader::dsl::*;
-
-        let updated = self
-            .with_measured_conn(
-                DatabaseOperation::UpdateLeader,
-                move |conn| -> DatabaseResult<usize> {
-                    let updated = match &prev {
-                        Some(prev) => diesel::update(leader)
-                            .filter(hostname.eq(prev.hostname.clone()))
-                            .filter(port.eq(prev.port))
-                            .filter(started_at.eq(prev.started_at))
-                            .set((
-                                hostname.eq(new.hostname.clone()),
-                                port.eq(new.port),
-                                started_at.eq(new.started_at),
-                            ))
-                            .execute(conn)?,
-                        None => diesel::insert_into(leader)
-                            .values(new.clone())
-                            .execute(conn)?,
-                    };
-
-                    Ok(updated)
-                },
-            )
-            .await?;
-
-        if updated == 0 {
-            return Err(DatabaseError::Logical(
-                "Leader table update failed".to_string(),
-            ));
-        }
-
-        Ok(())
-    }
 }

 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -921,69 +744,3 @@ pub(crate) struct NodePersistence {
    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: i32,
 }
-
-/// Tenant metadata health status that are stored durably.
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
-#[diesel(table_name = crate::schema::metadata_health)]
-pub(crate) struct MetadataHealthPersistence {
-    #[serde(default)]
-    pub(crate) tenant_id: String,
-    #[serde(default)]
-    pub(crate) shard_number: i32,
-    #[serde(default)]
-    pub(crate) shard_count: i32,
-
-    pub(crate) healthy: bool,
-    pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-impl MetadataHealthPersistence {
-    pub fn new(
-        tenant_shard_id: TenantShardId,
-        healthy: bool,
-        last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-    ) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_number = tenant_shard_id.shard_number.0 as i32;
-        let shard_count = tenant_shard_id.shard_count.literal() as i32;
-
-        MetadataHealthPersistence {
-            tenant_id,
-            shard_number,
-            shard_count,
-            healthy,
-            last_scrubbed_at,
-        }
-    }
-
-    #[allow(dead_code)]
-    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
-        Ok(TenantShardId {
-            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
-            shard_number: ShardNumber(self.shard_number as u8),
-            shard_count: ShardCount::new(self.shard_count as u8),
-        })
-    }
-}
-
-impl From<MetadataHealthPersistence> for MetadataHealthRecord {
-    fn from(value: MetadataHealthPersistence) -> Self {
-        MetadataHealthRecord {
-            tenant_shard_id: value
-                .get_tenant_shard_id()
-                .expect("stored tenant id should be valid"),
-            healthy: value.healthy,
-            last_scrubbed_at: value.last_scrubbed_at,
-        }
-    }
-}
-
-#[derive(
-    Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
-)]
-#[diesel(table_name = crate::schema::leader)]
-pub(crate) struct LeaderPersistence {
-    pub(crate) hostname: String,
-    pub(crate) port: i32,
-    pub(crate) started_at: chrono::DateTime<chrono::Utc>,
-}
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,23 +1,5 @@
 // @generated automatically by Diesel CLI.

-diesel::table! {
-    leader (hostname, port, started_at) {
-        hostname -> Varchar,
-        port -> Int4,
-        started_at -> Timestamptz,
-    }
-}
-
-diesel::table! {
-    metadata_health (tenant_id, shard_number, shard_count) {
-        tenant_id -> Varchar,
-        shard_number -> Int4,
-        shard_count -> Int4,
-        healthy -> Bool,
-        last_scrubbed_at -> Timestamptz,
-    }
-}
-
 diesel::table! {
    nodes (node_id) {
        node_id -> Int8,
@@ -44,4 +26,4 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(leader, metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -16,10 +16,7 @@ use crate::{
    compute_hook::NotifyError,
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    metrics::LeadershipStatusGroup,
-    peer_client::{GlobalObservedState, PeerClient},
-    persistence::{
-        AbortShardSplitStatus, LeaderPersistence, MetadataHealthPersistence, TenantFilter,
-    },
+    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
@@ -36,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
-        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+        ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
+        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -85,6 +82,7 @@ use crate::{
        ReconcilerWaiter, TenantShard,
    },
 };
+use serde::{Deserialize, Serialize};

 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -225,7 +223,6 @@ impl ServiceState {
        tenants: BTreeMap<TenantShardId, TenantShard>,
        scheduler: Scheduler,
        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
-        initial_leadership_status: LeadershipStatus,
    ) -> Self {
        let status = &crate::metrics::METRICS_REGISTRY
            .metrics_group
@@ -233,13 +230,15 @@ impl ServiceState {

        status.set(
            LeadershipStatusGroup {
-                status: initial_leadership_status,
+                status: LeadershipStatus::Leader,
            },
            1,
        );

        Self {
-            leadership_status: initial_leadership_status,
+            // TODO: Starting up as Leader is a transient state. Once we enable rolling
+            // upgrades on the k8s side, we should start up as Candidate.
+            leadership_status: LeadershipStatus::Leader,
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
@@ -288,33 +287,6 @@ impl ServiceState {
            0,
        );
    }
-
-    fn become_leader(&mut self) {
-        self.leadership_status = LeadershipStatus::Leader;
-
-        let status = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_leadership_status;
-
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
-            },
-            1,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::SteppedDown,
-            },
-            0,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Candidate,
-            },
-            0,
-        );
-    }
 }

 #[derive(Clone)]
@@ -351,10 +323,6 @@ pub struct Config {

    // TODO: make this cfg(feature  = "testing")
    pub neon_local_repo_dir: Option<PathBuf>,
-
-    pub start_as_candidate: bool,
-
-    pub http_service_port: i32,
 }

 impl From<DatabaseError> for ApiError {
@@ -522,10 +490,9 @@ pub(crate) enum ReconcileResultRequest {
    Stop,
 }

-struct LeaderStepDownState {
-    observed: GlobalObservedState,
-    leader: LeaderPersistence,
-}
+// TODO: move this into the storcon peer client when that gets added
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);

 impl Service {
    pub fn get_config(&self) -> &Config {
@@ -537,11 +504,15 @@ impl Service {
    #[instrument(skip_all)]
    async fn startup_reconcile(
        self: &Arc<Service>,
-        leader_step_down_state: Option<LeaderStepDownState>,
        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
            Result<(), (TenantShardId, NotifyError)>,
        >,
    ) {
+        // For all tenant shards, a vector of observed states on nodes (where None means
+        // indeterminate, same as in [`ObservedStateLocation`])
+        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
+            HashMap::new();
+
        // Startup reconciliation does I/O to other services: whether they
        // are responsive or not, we should aim to finish within our deadline, because:
        // - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -555,29 +526,26 @@ impl Service {
            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
            .expect("Reconcile timeout is a modest constant");

-        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
-            tracing::info!(
-                "Using observed received from leader at {}:{}",
-                state.leader.hostname,
-                state.leader.port
-            );
-            (state.observed, Some(state.leader))
-        } else {
-            (
-                self.build_global_observed_state(node_scan_deadline).await,
-                None,
-            )
-        };
-
        // Accumulate a list of any tenant locations that ought to be detached
        let mut cleanup = Vec::new();

-        // Send initial heartbeat requests to all nodes loaded from the database
-        let all_nodes = {
-            let locked = self.inner.read().unwrap();
-            locked.nodes.clone()
-        };
-        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
+        let node_listings = self.scan_node_locations(node_scan_deadline).await;
+        // Send initial heartbeat requests to nodes that replied to the location listing above.
+        let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
+
+        for (node_id, list_response) in node_listings {
+            let tenant_shards = list_response.tenant_shards;
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}, setting it to Active",
+                tenant_shards.len(),
+                node_id
+            );
+
+            for (tenant_shard_id, conf_opt) in tenant_shards {
+                let shard_observations = observed.entry(tenant_shard_id).or_default();
+                shard_observations.push((node_id, conf_opt));
+            }
+        }

        // List of tenants for which we will attempt to notify compute of their location at startup
        let mut compute_notifications = Vec::new();
@@ -600,16 +568,17 @@ impl Service {
            }
            *nodes = Arc::new(new_nodes);

-            for (tenant_shard_id, observed_state) in observed.0 {
-                let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
-                    for node_id in observed_state.locations.keys() {
-                        cleanup.push((tenant_shard_id, *node_id));
-                    }
-
-                    continue;
-                };
-
-                tenant_shard.observed = observed_state;
+            for (tenant_shard_id, shard_observations) in observed {
+                for (node_id, observed_loc) in shard_observations {
+                    let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
+                        cleanup.push((tenant_shard_id, node_id));
+                        continue;
+                    };
+                    tenant_shard
+                        .observed
+                        .locations
+                        .insert(node_id, ObservedStateLocation { conf: observed_loc });
+                }
            }

            // Populate each tenant's intent state
@@ -643,22 +612,6 @@ impl Service {
            tenants.len()
        };

-        // Before making any obeservable changes to the cluster, persist self
-        // as leader in database and memory.
-
-        let proposed_leader = self.get_proposed_leader_info();
-
-        if let Err(err) = self
-            .persistence
-            .update_leader(current_leader, proposed_leader)
-            .await
-        {
-            tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
-            std::process::exit(1);
-        }
-
-        self.inner.write().unwrap().become_leader();
-
        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
        // generation_pageserver in the database.

@@ -824,31 +777,6 @@ impl Service {
        node_results
    }

-    async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
-        let node_listings = self.scan_node_locations(deadline).await;
-        let mut observed = GlobalObservedState::default();
-
-        for (node_id, location_confs) in node_listings {
-            tracing::info!(
-                "Received {} shard statuses from pageserver {}",
-                location_confs.tenant_shards.len(),
-                node_id
-            );
-
-            for (tid, location_conf) in location_confs.tenant_shards {
-                let entry = observed.0.entry(tid).or_default();
-                entry.locations.insert(
-                    node_id,
-                    ObservedStateLocation {
-                        conf: location_conf,
-                    },
-                );
-            }
-        }
-
-        observed
-    }
-
    /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
    ///
    /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
@@ -1327,20 +1255,12 @@ impl Service {
            config.max_warming_up_interval,
            cancel.clone(),
        );
-
-        let initial_leadership_status = if config.start_as_candidate {
-            LeadershipStatus::Candidate
-        } else {
-            LeadershipStatus::Leader
-        };
-
        let this = Arc::new(Self {
            inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                nodes,
                tenants,
                scheduler,
                delayed_reconcile_rx,
-                initial_leadership_status,
            ))),
            config: config.clone(),
            persistence,
@@ -1409,16 +1329,7 @@ impl Service {
                    return;
                };

-                let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let peer_observed_state = match leadership_status {
-                    LeadershipStatus::Candidate => this.request_step_down().await,
-                    LeadershipStatus::Leader => None,
-                    LeadershipStatus::SteppedDown => unreachable!(),
-                };
-
-                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
-                    .await;
-
+                this.startup_reconcile(bg_compute_notify_result_tx).await;
                drop(startup_completion);
            }
        });
@@ -6184,68 +6095,6 @@ impl Service {
        Ok(())
    }

-    /// Updates scrubber metadata health check results.
-    pub(crate) async fn metadata_health_update(
-        &self,
-        update_req: MetadataHealthUpdateRequest,
-    ) -> Result<(), ApiError> {
-        let now = chrono::offset::Utc::now();
-        let (healthy_records, unhealthy_records) = {
-            let locked = self.inner.read().unwrap();
-            let healthy_records = update_req
-                .healthy_tenant_shards
-                .into_iter()
-                // Retain only health records associated with tenant shards managed by storage controller.
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
-                .collect();
-            let unhealthy_records = update_req
-                .unhealthy_tenant_shards
-                .into_iter()
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
-                .collect();
-
-            (healthy_records, unhealthy_records)
-        };
-
-        self.persistence
-            .update_metadata_health_records(healthy_records, unhealthy_records, now)
-            .await?;
-        Ok(())
-    }
-
-    /// Lists the tenant shards that has unhealthy metadata status.
-    pub(crate) async fn metadata_health_list_unhealthy(
-        &self,
-    ) -> Result<Vec<TenantShardId>, ApiError> {
-        let result = self
-            .persistence
-            .list_unhealthy_metadata_health_records()
-            .await?
-            .iter()
-            .map(|p| p.get_tenant_shard_id().unwrap())
-            .collect();
-
-        Ok(result)
-    }
-
-    /// Lists the tenant shards that have not been scrubbed for some duration.
-    pub(crate) async fn metadata_health_list_outdated(
-        &self,
-        not_scrubbed_for: Duration,
-    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
-        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
-        let result = self
-            .persistence
-            .list_outdated_metadata_health_records(earlier)
-            .await?
-            .into_iter()
-            .map(|record| record.into())
-            .collect();
-        Ok(result)
-    }
-
    pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
        self.inner.read().unwrap().get_leadership_status()
    }
@@ -6268,88 +6117,4 @@ impl Service {

        global_observed
    }
-
-    /// Collect the details for the current proccess wishing to become the storage controller
-    /// leader.
-    ///
-    /// On failures to discover and resolve the hostname the process is killed and we rely on k8s to retry.
-    fn get_proposed_leader_info(&self) -> LeaderPersistence {
-        let hostname = match dns_lookup::get_hostname() {
-            Ok(name) => name,
-            Err(err) => {
-                tracing::error!("Failed to discover hostname: {err}. Aborting start-up ...");
-                std::process::exit(1);
-            }
-        };
-
-        let mut addrs = match dns_lookup::lookup_host(&hostname) {
-            Ok(addrs) => addrs,
-            Err(err) => {
-                tracing::error!("Failed to resolve hostname: {err}. Aborting start-up ...");
-                std::process::exit(1);
-            }
-        };
-
-        let addr = addrs
-            .pop()
-            .expect("k8s configured hostname always resolves");
-
-        let proposed = LeaderPersistence {
-            hostname: addr.to_string(),
-            port: self.get_config().http_service_port,
-            started_at: chrono::Utc::now(),
-        };
-
-        tracing::info!("Proposed leader details are: {proposed:?}");
-
-        proposed
-    }
-
-    /// Request step down from the currently registered leader in the database
-    ///
-    /// If such an entry is persisted, the success path returns the observed
-    /// state and details of the leader. Otherwise, None is returned indicating
-    /// there is no leader currently.
-    ///
-    /// On failures to query the database or step down error responses the process is killed
-    /// and we rely on k8s to retry.
-    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
-        let leader = match self.persistence.get_leader().await {
-            Ok(leader) => leader,
-            Err(err) => {
-                tracing::error!(
-                    "Failed to query database for current leader: {err}. Aborting start-up ..."
-                );
-                std::process::exit(1);
-            }
-        };
-
-        match leader {
-            Some(leader) => {
-                // TODO: jwt token
-                let client = PeerClient::new(
-                    leader.hostname.to_owned(),
-                    leader.port,
-                    self.config.jwt_token.clone(),
-                );
-                let state = client.step_down(&self.cancel).await;
-                match state {
-                    Ok(state) => Some(LeaderStepDownState {
-                        observed: state,
-                        leader: leader.clone(),
-                    }),
-                    Err(err) => {
-                        tracing::error!(
-                            "Leader ({}:{}) did not respond to step-down request: {}",
-                            leader.hostname,
-                            leader.port,
-                            err
-                        );
-                        None
-                    }
-                }
-            }
-            None => None,
-        }
-    }
 }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -40,11 +40,6 @@ impl TimelineAnalysis {
            garbage_keys: Vec::new(),
        }
    }
-
-    /// Whether a timeline is healthy.
-    pub(crate) fn is_healthy(&self) -> bool {
-        self.errors.is_empty() && self.warnings.is_empty()
-    }
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -1,13 +1,10 @@
-use std::pin::pin;
-
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::storage_layer::LayerName;
-use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};

 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
-    stream_objects_with_retries, BucketConfig, NodeKind,
+    checks::parse_layer_object_name, init_remote, list_objects_with_retries,
+    metadata_stream::stream_tenants, BucketConfig, NodeKind,
 };

 #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -50,38 +47,45 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
        let mut tenant_root = target.tenant_root(&tenant_shard_id);
-        let remote_client = remote_client.clone();
+        let s3_client = s3_client.clone();
        async move {
            let mut objects = Vec::new();
            let mut total_objects_ctr = 0u64;
            // We want the objects and not just common prefixes
            tenant_root.delimiter.clear();
-            let mut objects_stream = pin!(stream_objects_with_retries(
-                &remote_client,
-                ListingMode::NoDelimiter,
-                &tenant_root
-            ));
-            while let Some(listing) = objects_stream.next().await {
-                let listing = listing?;
-                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
-                    let key = obj.key.to_string();
+            let mut continuation_token = None;
+            loop {
+                let fetch_response =
+                    list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
+                        .await?;
+                for obj in fetch_response.contents().iter().filter(|o| {
+                    if let Some(obj_size) = o.size {
+                        min_size as i64 <= obj_size
+                    } else {
+                        false
+                    }
+                }) {
+                    let key = obj.key().expect("couldn't get key").to_owned();
                    let kind = LargeObjectKind::from_key(&key);
                    if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
                        continue;
                    }
                    objects.push(LargeObject {
                        key,
-                        size: obj.size,
+                        size: obj.size.unwrap() as u64,
                        kind,
                    })
                }
-                total_objects_ctr += listing.keys.len() as u64;
+                total_objects_ctr += fetch_response.contents().len() as u64;
+                match fetch_response.next_continuation_token {
+                    Some(new_token) => continuation_token = Some(new_token),
+                    None => break,
+                }
            }

            Ok((tenant_shard_id, objects, total_objects_ctr))
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -5,7 +5,6 @@
 use std::{
    collections::{HashMap, HashSet},
    sync::Arc,
-    time::Duration,
 };

 use anyhow::Context;
@@ -19,7 +18,7 @@ use utils::id::TenantId;

 use crate::{
    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote, init_remote_generic, list_objects_with_retries,
+    init_remote, init_remote_generic,
    metadata_stream::{stream_tenant_timelines, stream_tenants},
    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };
@@ -28,11 +27,6 @@ use crate::{
 enum GarbageReason {
    DeletedInConsole,
    MissingInConsole,
-
-    // The remaining data relates to a known deletion issue, and we're sure that purging this
-    // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
-    // there is nothing in a tenant path apart from a heatmap file.
-    KnownBug,
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -78,15 +72,6 @@ impl GarbageList {
        }
    }

-    /// If an entity has been identified as requiring purge due to a known bug, e.g.
-    /// a particular type of object left behind after an incomplete deletion.
-    fn append_buggy(&mut self, entity: GarbageEntity) {
-        self.items.push(GarbageItem {
-            entity,
-            reason: GarbageReason::KnownBug,
-        });
-    }
-
    /// Return true if appended, false if not.  False means the result was not garbage.
    fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
    where
@@ -234,71 +219,6 @@ async fn find_garbage_inner(
            assert!(project.tenant == tenant_shard_id.tenant_id);
        }

-        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
-        // identify it as purge-able anyway
-        if console_result.is_none() {
-            let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
-                .await?
-                .collect::<Vec<_>>()
-                .await;
-            if timelines.is_empty() {
-                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries(
-                    &s3_client,
-                    &target.tenant_root(&tenant_shard_id),
-                    None,
-                )
-                .await?;
-                let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
-                if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
-                }
-            } else {
-                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
-                // rollout of WAL DR in which we never deleted these.
-                let mut any_non_initdb = false;
-
-                for timeline_r in timelines {
-                    let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries(
-                        &s3_client,
-                        &target.timeline_root(&timeline),
-                        None,
-                    )
-                    .await?;
-                    if timeline_objects
-                        .common_prefixes
-                        .as_ref()
-                        .map(|v| v.len())
-                        .unwrap_or(0)
-                        > 0
-                    {
-                        // Sub-paths?  Unexpected
-                        any_non_initdb = true;
-                    } else {
-                        let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
-                        if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
-                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
-                        } else {
-                            any_non_initdb = true;
-                        }
-                    }
-                }
-
-                if any_non_initdb {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                }
-            }
-        }
-
        if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
            tracing::debug!("Tenant {tenant_shard_id} is garbage");
        } else {
@@ -429,6 +349,9 @@ pub async fn get_timeline_objects(
    tracing::debug!("Listing objects in timeline {ttid}");
    let timeline_root = super::remote_timeline_path_id(&ttid);

+    // TODO: apply extra validation based on object modification time.  Don't purge
+    // timelines whose index_part.json has been touched recently.
+
    let list = s3_client
        .list(
            Some(&timeline_root),
@@ -499,7 +422,6 @@ impl DeletionProgressTracker {
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
-    min_age: Duration,
    dry_run: bool,
 ) -> anyhow::Result<()> {
    let list_bytes = tokio::fs::read(&input_path).await?;
@@ -510,7 +432,7 @@ pub async fn purge_garbage(
        input_path
    );

-    let (remote_client, _target) =
+    let remote_client =
        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;

    assert_eq!(
@@ -537,7 +459,6 @@ pub async fn purge_garbage(
        .filter(|i| match (&mode, &i.reason) {
            (PurgeMode::DeletedAndMissing, _) => true,
            (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
-            (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
        });

@@ -566,37 +487,6 @@ pub async fn purge_garbage(
    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
-
-        // Extra safety check: even if a collection of objects is garbage, check max() of modification
-        // times before purging, so that if we incorrectly marked a live tenant as garbage then we would
-        // notice that its index has been written recently and would omit deleting it.
-        if object_list.is_empty() {
-            // Simplify subsequent code by ensuring list always has at least one item
-            // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
-            continue;
-        }
-        let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
-        let age = max_mtime.elapsed();
-        match age {
-            Err(_) => {
-                tracing::warn!("Bad last_modified time");
-                continue;
-            }
-            Ok(a) if a < min_age => {
-                // Failed age check.  This doesn't mean we did something wrong: a tenant might really be garbage and recently
-                // written, but out of an abundance of caution we still don't purge it.
-                tracing::info!(
-                    "Skipping tenant with young objects {}..{}",
-                    object_list.first().as_ref().unwrap().key,
-                    object_list.last().as_ref().unwrap().key
-                );
-                continue;
-            }
-            Ok(_) => {
-                // Passed age check
-            }
-        }
-
        objects_to_delete.append(&mut object_list);
        if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
            do_delete(
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -22,19 +22,16 @@ use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
-use futures::{Stream, StreamExt};
 use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
-    GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
-use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -256,12 +253,6 @@ pub struct ControllerClientConfig {
    pub controller_jwt: String,
 }

-impl ControllerClientConfig {
-    pub fn build_client(self) -> control_api::Client {
-        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
-    }
-}
-
 pub struct ConsoleConfig {
    pub token: String,
    pub base_url: Url,
@@ -328,35 +319,27 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
    }
 }

-fn make_root_target(
-    bucket_name: String,
-    prefix_in_bucket: String,
-    node_kind: NodeKind,
-) -> RootTarget {
-    let s3_target = S3Target {
-        bucket_name,
-        prefix_in_bucket,
-        delimiter: "/".to_string(),
-    };
-    match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
-    }
-}
-
 async fn init_remote(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
    let bucket_region = Region::new(bucket_config.region);
+    let delimiter = "/".to_string();
    let s3_client = Arc::new(init_s3_client(bucket_region).await);
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();

-    let s3_root = make_root_target(
-        bucket_config.bucket,
-        bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-        node_kind,
-    );
+    let s3_root = match node_kind {
+        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+    };

    Ok((s3_client, s3_root))
 }
@@ -364,12 +347,12 @@ async fn init_remote(
 async fn init_remote_generic(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
-) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
+) -> anyhow::Result<GenericRemoteStorage> {
    let endpoint = env::var("AWS_ENDPOINT_URL").ok();
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();
    let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
    let storage = S3Config {
-        bucket_name: bucket_config.bucket.clone(),
+        bucket_name: bucket_config.bucket,
        bucket_region: bucket_config.region,
        prefix_in_bucket,
        endpoint,
@@ -383,13 +366,7 @@ async fn init_remote_generic(
        storage: RemoteStorageKind::AwsS3(storage),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
-
-    // We already pass the prefix to the remote client above
-    let prefix_in_root_target = String::new();
-    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
-
-    let client = GenericRemoteStorage::from_config(&storage_config).await?;
-    Ok((client, s3_root))
+    GenericRemoteStorage::from_config(&storage_config).await
 }

 async fn list_objects_with_retries(
@@ -427,44 +404,6 @@ async fn list_objects_with_retries(
    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

-fn stream_objects_with_retries<'a>(
-    storage_client: &'a GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &'a S3Target,
-) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
-    async_stream::stream! {
-        let mut trial = 0;
-        let cancel = CancellationToken::new();
-        let prefix_str = &s3_target
-            .prefix_in_bucket
-            .strip_prefix("/")
-            .unwrap_or(&s3_target.prefix_in_bucket);
-        let prefix = RemotePath::from_string(prefix_str)?;
-        let mut list_stream =
-            storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
-        while let Some(res) = list_stream.next().await {
-            if let Err(err) = res {
-                let yield_err = if err.is_permanent() {
-                    true
-                } else {
-                    let backoff_time = 1 << trial.max(5);
-                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-                    trial += 1;
-                    trial == MAX_RETRIES - 1
-                };
-                if yield_err {
-                    yield Err(err)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                    break;
-                }
-            } else {
-                trial = 0;
-                yield res.map_err(anyhow::Error::from);
-            }
-        }
-    }
-}
-
 async fn download_object_with_retries(
    s3_client: &Client,
    bucket_name: &str,
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,8 +1,7 @@
 use anyhow::{anyhow, bail};
 use camino::Utf8PathBuf;
-use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::{Method, Url};
+use reqwest::Url;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -51,8 +50,6 @@ enum Command {
        input_path: String,
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
-        #[arg(long = "min-age")]
-        min_age: humantime::Duration,
    },
    #[command(verbatim_doc_comment)]
    ScanMetadata {
@@ -62,8 +59,6 @@ enum Command {
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
-        #[arg(long = "post", default_value_t = false)]
-        post_to_storage_controller: bool,
        #[arg(long, default_value = None)]
        /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -119,20 +114,11 @@ async fn main() -> anyhow::Result<()> {
        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
    ));

-    let controller_client_conf = cli.controller_api.map(|controller_api| {
-        ControllerClientConfig {
-            controller_api,
-            // Default to no key: this is a convenience when working in a development environment
-            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
-        }
-    });
-
    match cli.command {
        Command::ScanMetadata {
            json,
            tenant_ids,
            node_kind,
-            post_to_storage_controller,
            dump_db_connstr,
            dump_db_table,
        } => {
@@ -171,9 +157,6 @@ async fn main() -> anyhow::Result<()> {
                }
                Ok(())
            } else {
-                if controller_client_conf.is_none() && post_to_storage_controller {
-                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
-                }
                match scan_metadata(bucket_config.clone(), tenant_ids).await {
                    Err(e) => {
                        tracing::error!("Failed: {e}");
@@ -185,21 +168,6 @@ async fn main() -> anyhow::Result<()> {
                        } else {
                            println!("{}", summary.summary_string());
                        }
-
-                        if post_to_storage_controller {
-                            if let Some(conf) = controller_client_conf {
-                                let controller_client = conf.build_client();
-                                let body = summary.build_health_update_request();
-                                controller_client
-                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
-                                        Method::POST,
-                                        "control/v1/metadata_health/update".to_string(),
-                                        Some(body),
-                                    )
-                                    .await?;
-                            }
-                        }
-
                        if summary.is_fatal() {
                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
                        } else if summary.is_empty() {
@@ -228,11 +196,9 @@ async fn main() -> anyhow::Result<()> {
            let console_config = ConsoleConfig::from_env()?;
            find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
        }
-        Command::PurgeGarbage {
-            input_path,
-            mode,
-            min_age,
-        } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
+        Command::PurgeGarbage { input_path, mode } => {
+            purge_garbage(input_path, mode, !cli.delete).await
+        }
        Command::TenantSnapshot {
            tenant_id,
            output_path,
@@ -247,6 +213,14 @@ async fn main() -> anyhow::Result<()> {
            min_age,
            mode,
        } => {
+            let controller_client_conf = cli.controller_api.map(|controller_api| {
+                ControllerClientConfig {
+                    controller_api,
+                    // Default to no key: this is a convenience when working in a development environment
+                    controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
+                }
+            });
+
            match (&controller_client_conf, mode) {
                (Some(_), _) => {
                    // Any mode may run when controller API is set
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -1,41 +1,12 @@
-use std::str::FromStr;
-
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use futures::StreamExt;
-use remote_storage::{GenericRemoteStorage, ListingMode};
 use tokio_stream::Stream;

-use crate::{
-    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
-    TenantShardTimelineId,
-};
+use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};

-/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
-pub fn stream_tenants_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a RootTarget,
-) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
-    try_stream! {
-        let tenants_target = target.tenants_root();
-        let mut tenants_stream =
-            std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
-        while let Some(chunk) = tenants_stream.next().await {
-            let chunk = chunk?;
-            let entry_ids = chunk.prefixes.iter()
-                .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
-            for dir_name_res in entry_ids {
-                let dir_name = dir_name_res?;
-                let id = TenantShardId::from_str(dir_name)?;
-                yield id;
-            }
-        }
-    }
-}
-
 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
    s3_client: &'a Client,
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -567,7 +567,13 @@ pub async fn pageserver_physical_gc(
    }

    // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
+    let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
+        let ControllerClientConfig {
+            controller_api,
+            controller_jwt,
+        } = c;
+        control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
+    }) else {
        tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
        return Ok(summary);
    };
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -9,13 +9,12 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
-use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;

-#[derive(Serialize, Default)]
+#[derive(Serialize)]
 pub struct MetadataSummary {
    tenant_count: usize,
    timeline_count: usize,
@@ -24,16 +23,19 @@ pub struct MetadataSummary {
    with_warnings: HashSet<TenantShardTimelineId>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,
-
-    #[serde(skip)]
-    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
-    #[serde(skip)]
-    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 impl MetadataSummary {
    fn new() -> Self {
-        Self::default()
+        Self {
+            tenant_count: 0,
+            timeline_count: 0,
+            timeline_shard_count: 0,
+            with_errors: HashSet::new(),
+            with_warnings: HashSet::new(),
+            with_orphans: HashSet::new(),
+            indices_by_version: HashMap::new(),
+        }
    }

    fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -52,13 +54,6 @@ impl MetadataSummary {
    }

    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
-        if analysis.is_healthy() {
-            self.healthy_tenant_shards.insert(id.tenant_shard_id);
-        } else {
-            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
-            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
-        }
-
        if !analysis.errors.is_empty() {
            self.with_errors.insert(*id);
        }
@@ -106,13 +101,6 @@ Index versions: {version_summary}
    pub fn is_empty(&self) -> bool {
        self.timeline_shard_count == 0
    }
-
-    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
-        MetadataHealthUpdateRequest {
-            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
-            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
-        }
-    }
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -222,8 +222,6 @@ class NeonBenchmarker:
    function by the zenbenchmark fixture
    """

-    PROPERTY_PREFIX = "neon_benchmarker_"
-
    def __init__(self, property_recorder: Callable[[str, object], None]):
        # property recorder here is a pytest fixture provided by junitxml module
        # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
@@ -240,7 +238,7 @@ class NeonBenchmarker:
        Record a benchmark result.
        """
        # just to namespace the value
-        name = f"{self.PROPERTY_PREFIX}_{metric_name}"
+        name = f"neon_benchmarker_{metric_name}"
        self.property_recorder(
            name,
            {
@@ -251,18 +249,6 @@ class NeonBenchmarker:
            },
        )

-    @classmethod
-    def records(
-        cls, user_properties: list[tuple[str, object]]
-    ) -> Iterator[tuple[str, dict[str, object]]]:
-        """
-        Yield all records related to benchmarks
-        """
-        for property_name, recorded_property in user_properties:
-            if property_name.startswith(cls.PROPERTY_PREFIX):
-                assert isinstance(recorded_property, dict)
-                yield recorded_property["name"], recorded_property
-
    @contextmanager
    def record_duration(self, metric_name: str) -> Iterator[None]:
        """
@@ -439,11 +425,10 @@ def zenbenchmark(
    yield benchmarker

    results = {}
-    for _, recorded_property in NeonBenchmarker.records(request.node.user_properties):
+    for _, recorded_property in request.node.user_properties:
        name = recorded_property["name"]
        value = str(recorded_property["value"])
-        unit = str(recorded_property["unit"]).strip()
-        if unit != "":
+        if (unit := recorded_property["unit"].strip()) != "":
            value += f" {unit}"
        results[name] = value

@@ -492,7 +477,7 @@ def pytest_terminal_summary(
    for test_report in terminalreporter.stats.get("passed", []):
        result_entry = []

-        for _, recorded_property in NeonBenchmarker.records(test_report.user_properties):
+        for _, recorded_property in test_report.user_properties:
            if not is_header_printed:
                terminalreporter.section("Benchmark results", "-")
                is_header_printed = True
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -449,7 +449,6 @@ class TokenScope(str, Enum):
    GENERATIONS_API = "generations_api"
    SAFEKEEPER_DATA = "safekeeperdata"
    TENANT = "tenant"
-    SCRUBBER = "scrubber"


 class NeonEnvBuilder:
@@ -1442,7 +1441,6 @@ def neon_env_builder(
    pageserver_virtual_file_io_engine: str,
    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
    pageserver_aux_file_policy: Optional[AuxFileStore],
-    record_property: Callable[[str, object], None],
 ) -> Iterator[NeonEnvBuilder]:
    """
    Fixture to create a Neon environment for test.
@@ -1482,7 +1480,9 @@ def neon_env_builder(
        yield builder
        # Propogate `preserve_database_files` to make it possible to use in other fixtures,
        # like `test_output_dir` fixture for attaching all database files to Allure report.
-        record_property("preserve_database_files", builder.preserve_database_files)
+        request.node.user_properties.append(
+            ("preserve_database_files", builder.preserve_database_files)
+        )


@dataclass
@@ -2587,51 +2587,6 @@ class NeonStorageController(MetricsGetter, LogUtils):

                time.sleep(backoff)

-    def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
-        body: Dict[str, Any] = {
-            "healthy_tenant_shards": [str(t) for t in healthy],
-            "unhealthy_tenant_shards": [str(t) for t in unhealthy],
-        }
-
-        self.request(
-            "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
-            json=body,
-            headers=self.headers(TokenScope.SCRUBBER),
-        )
-
-    def metadata_health_list_unhealthy(self):
-        response = self.request(
-            "GET",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        return response.json()
-
-    def metadata_health_list_outdated(self, duration: str):
-        body: Dict[str, Any] = {"not_scrubbed_for": duration}
-
-        response = self.request(
-            "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
-            json=body,
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        return response.json()
-
-    def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
-        """Metadata is healthy if there is no unhealthy or outdated health records."""
-
-        unhealthy = self.metadata_health_list_unhealthy()
-        outdated = self.metadata_health_list_outdated(outdated_duration)
-
-        healthy = (
-            len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
-        )
-        if not healthy:
-            log.info(f"{unhealthy=}, {outdated=}")
-        return healthy
-
    def step_down(self):
        log.info("Asking storage controller to step down")
        response = self.request(
@@ -4401,11 +4356,10 @@ class StorageScrubber:
        assert stdout is not None
        return stdout

-    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
-        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
-        if post_to_storage_controller:
-            args.append("--post")
-        stdout = self.scrubber_cli(args, timeout=30)
+    def scan_metadata(self) -> Any:
+        stdout = self.scrubber_cli(
+            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
+        )

        try:
            return json.loads(stdout)
--- a/test_runner/logical_repl/test_log_repl.py
+++ b/test_runner/logical_repl/test_log_repl.py
@@ -1,88 +0,0 @@
-"""
-Test the logical replication in Neon with the different consumers
-"""
-
-import hashlib
-import time
-
-import clickhouse_connect
-import psycopg2
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import RemotePostgres
-from fixtures.utils import wait_until
-
-
-def query_clickhouse(
-    client,
-    query: str,
-    digest: str,
-) -> None:
-    """
-    Run the query on the client
-    return answer if successful, raise an exception otherwise
-    """
-    log.debug("Query: %s", query)
-    res = client.query(query)
-    log.debug(res.result_rows)
-    m = hashlib.sha1()
-    m.update(repr(tuple(res.result_rows)).encode())
-    hash_res = m.hexdigest()
-    log.debug("Hash: %s", hash_res)
-    if hash_res == digest:
-        return
-    raise ValueError("Hash mismatch")
-
-
-@pytest.mark.remote_cluster
-def test_clickhouse(remote_pg: RemotePostgres):
-    """
-    Test the logical replication having ClickHouse as a client
-    """
-    conn_options = remote_pg.conn_options()
-    for _ in range(5):
-        try:
-            conn = psycopg2.connect(remote_pg.connstr())
-        except psycopg2.OperationalError as perr:
-            log.debug(perr)
-            time.sleep(1)
-        else:
-            break
-        raise TimeoutError
-    cur = conn.cursor()
-    cur.execute("DROP TABLE IF EXISTS table1")
-    cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
-    cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
-    conn.commit()
-    client = clickhouse_connect.get_client(host="clickhouse")
-    client.command("SET allow_experimental_database_materialized_postgresql=1")
-    client.command(
-        "CREATE DATABASE db1_postgres ENGINE = "
-        f"MaterializedPostgreSQL('{conn_options['host']}', "
-        f"'{conn_options['dbname']}', "
-        f"'{conn_options['user']}', '{conn_options['password']}') "
-        "SETTINGS materialized_postgresql_tables_list = 'table1';"
-    )
-    wait_until(
-        120,
-        0.5,
-        lambda: query_clickhouse(
-            client,
-            "select * from db1_postgres.table1 order by 1",
-            "ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
-        ),
-    )
-    cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
-    conn.commit()
-    wait_until(
-        120,
-        0.5,
-        lambda: query_clickhouse(
-            client,
-            "select * from db1_postgres.table1 order by 1",
-            "9eba2daaf7e4d7d27ac849525f68b562ab53947d",
-        ),
-    )
-    log.debug("Sleeping before final checking if Neon is still alive")
-    time.sleep(3)
-    cur.execute("SELECT 1")
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -389,11 +389,6 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
        repeat_result = ps_http.timeline_create(
            env.pg_version, env.initial_tenant, success_timeline, timeout=60
        )
-        # remote_consistent_lsn_visible will be published only after we've
-        # confirmed the generation, which is not part of what we await during
-        # timeline creation (uploads). mask it out here to avoid flakyness.
-        del success_result["remote_consistent_lsn_visible"]
-        del repeat_result["remote_consistent_lsn_visible"]
        assert repeat_result == success_result
    finally:
        env.pageserver.stop(immediate=True)
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3,7 +3,7 @@ import threading
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Union

 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -1785,126 +1785,6 @@ def test_storage_controller_node_deletion(
    env.storage_controller.consistency_check()


-@pytest.mark.parametrize("shard_count", [None, 2])
-def test_storage_controller_metadata_health(
-    neon_env_builder: NeonEnvBuilder,
-    shard_count: Optional[int],
-):
-    """
-    Create three tenants A, B, C.
-
-    Phase 1:
-    - A: Post healthy status.
-    - B: Post unhealthy status.
-    - C: No updates.
-
-    Phase 2:
-    - B: Post healthy status.
-    - C: Post healthy status.
-
-    Phase 3:
-    - A: Post unhealthy status.
-
-    Phase 4:
-    - Delete tenant A, metadata health status should be deleted as well.
-    """
-
-    def update_and_query_metadata_health(
-        env: NeonEnv,
-        healthy: List[TenantShardId],
-        unhealthy: List[TenantShardId],
-        outdated_duration: str = "1h",
-    ) -> Tuple[Set[str], Set[str]]:
-        """
-        Update metadata health. Then list tenant shards with unhealthy and
-        outdated metadata health status.
-        """
-        if healthy or unhealthy:
-            env.storage_controller.metadata_health_update(healthy, unhealthy)
-        result = env.storage_controller.metadata_health_list_unhealthy()
-        unhealthy_res = set(result["unhealthy_tenant_shards"])
-        result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
-        outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
-
-        return unhealthy_res, outdated_res
-
-    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
-
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_start()
-
-    # Mock tenant (`initial_tenant``) with healthy scrubber scan result
-    tenant_a_shard_ids = (
-        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
-        if shard_count is not None
-        else [TenantShardId(env.initial_tenant, 0, 0)]
-    )
-
-    # Mock tenant with unhealthy scrubber scan result
-    tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
-    tenant_b_shard_ids = (
-        env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
-        if shard_count is not None
-        else [TenantShardId(tenant_b, 0, 0)]
-    )
-
-    # Mock tenant that never gets a health update from scrubber
-    tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
-
-    tenant_c_shard_ids = (
-        env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
-        if shard_count is not None
-        else [TenantShardId(tenant_c, 0, 0)]
-    )
-
-    # Metadata health table also updated as tenant shards are created.
-    assert env.storage_controller.metadata_health_is_healthy()
-
-    # post "fake" updates to storage controller db
-
-    unhealthy, outdated = update_and_query_metadata_health(
-        env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
-    )
-
-    log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
-    assert len(unhealthy) == len(tenant_b_shard_ids)
-    for t in tenant_b_shard_ids:
-        assert str(t) in unhealthy
-    assert len(outdated) == 0
-
-    unhealthy, outdated = update_and_query_metadata_health(
-        env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
-    )
-
-    log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
-    assert len(unhealthy) == 0
-    assert len(outdated) == 0
-
-    unhealthy, outdated = update_and_query_metadata_health(
-        env, healthy=[], unhealthy=tenant_a_shard_ids
-    )
-
-    log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
-    assert len(unhealthy) == len(tenant_a_shard_ids)
-    for t in tenant_a_shard_ids:
-        assert str(t) in unhealthy
-    assert len(outdated) == 0
-
-    # Phase 4: Delete A
-    env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
-
-    # A's unhealthy metadata health status should be deleted as well.
-    assert env.storage_controller.metadata_health_is_healthy()
-
-    # All shards from B and C are not fresh if set outdated duration to 0 seconds.
-    unhealthy, outdated = update_and_query_metadata_health(
-        env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
-    )
-    assert len(unhealthy) == 0
-    for t in tenant_b_shard_ids + tenant_c_shard_ids:
-        assert str(t) in outdated
-
-
 def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
    """
    Test the `/control/v1/step_down` storage controller API. Upon receiving such
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -440,12 +440,10 @@ def test_scrubber_scan_pageserver_metadata(
    assert len(index.layer_metadata) > 0
    it = iter(index.layer_metadata.items())

-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    scan_summary = env.storage_scrubber.scan_metadata()
    assert not scan_summary["with_warnings"]
    assert not scan_summary["with_errors"]

-    assert env.storage_controller.metadata_health_is_healthy()
-
    # Delete a layer file that is listed in the index.
    layer, metadata = next(it)
    log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -455,17 +453,7 @@ def test_scrubber_scan_pageserver_metadata(
    )
    log.info(f"delete response: {delete_response}")

-    # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
+    # Check scan summary. Expect it to be a L0 layer so only emit warnings.
    scan_summary = env.storage_scrubber.scan_metadata()
    log.info(f"{pprint.pformat(scan_summary)}")
    assert len(scan_summary["with_warnings"]) > 0
-
-    assert env.storage_controller.metadata_health_is_healthy()
-
-    # Now post to storage controller, expect seeing one unhealthy health record
-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
-    log.info(f"{pprint.pformat(scan_summary)}")
-    assert len(scan_summary["with_warnings"]) > 0
-
-    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
-    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
Author	SHA1	Message	Date
Conrad Ludgate	ef7e96fb4e	tweak comments	2024-07-29 11:41:44 +01:00
Conrad Ludgate	54c5196f75	proxy: improve performance of leaky-bucket	2024-07-28 23:00:21 +01:00