Merge branch 'layer-stats' of github.com:neondatabase/neon into layer-stats

Address comments
accept suggestion
2026-06-22 14:50:38 +00:00 · 2023-06-10 10:47:42 -04:00 · 2023-06-10 10:46:28 -04:00 · 2023-06-10 10:35:13 -04:00 · 2023-06-02 16:46:48 -04:00 · 2023-06-02 08:28:13 -04:00
75 changed files with 3557 additions and 1503 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -407,9 +407,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate

      - uses: actions/github-script@v6
-        if: >
-          !cancelled() &&
-          github.event_name == 'pull_request'
+        if: ${{ !cancelled() }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -419,7 +417,7 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

-            const script = require("./scripts/pr-comment-test-report.js")
+            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
@@ -494,19 +492,24 @@ jobs:
        env:
          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          scripts/coverage \
-            --dir=/tmp/coverage report \
+          scripts/coverage --dir=/tmp/coverage \
+            report \
            --input-objects=/tmp/coverage/binaries.list \
            --commit-url=${COMMIT_URL} \
            --format=github

+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
      - name: Upload coverage report
        id: upload-coverage-report
        env:
          BUCKET: neon-github-public-dev
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
@@ -661,6 +664,9 @@ jobs:
          project: nrdv0s4kcs
          push: true
          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+          build-args: |
+            GIT_VERSION=${{ github.sha }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
@@ -775,7 +781,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.7.3-alpha3
+      VM_BUILDER_VERSION: v0.8.0

    steps:
      - name: Checkout
@@ -796,7 +802,7 @@ jobs:

      - name: Build vm image
        run: |
-          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.25"
+aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -632,6 +632,7 @@ RUN apt update &&  \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
+        libcurl4-openssl-dev \
        procps && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -362,6 +362,8 @@ impl ComputeNode {
        };

        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        client.simple_query("SET neon.forward_ddl = false")?;
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
@@ -403,7 +405,9 @@ impl ComputeNode {
        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
+            client.simple_query("SET neon.forward_ddl = false")?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .init();
    tracing::info!("logging and tracing started");

+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
    Ok(())
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -121,9 +121,8 @@ impl RoleExt for Role {
    /// string of arguments.
    fn to_pg_options(&self) -> String {
        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
-        // For now, we do not use generic `options` for roles. Once used, add
-        // `self.options.as_pg_options()` somewhere here.
-        let mut params: String = "LOGIN".to_string();
+        let mut params: String = self.options.as_pg_options();
+        params.push_str(" LOGIN");

        if let Some(pass) = &self.encrypted_password {
            // Some time ago we supported only md5 and treated all encrypted_password as md5.
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -62,7 +62,7 @@ fn do_control_plane_request(
    }
 }

-/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
+/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
 /// env variable is set, it will be used for authorization.
 pub fn get_spec_from_control_plane(
    base_uri: &str,
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -16,7 +16,7 @@ mod pg_helpers_tests {
        );
        assert_eq!(
            spec.cluster.roles.first().unwrap().to_pg_options(),
-            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+            " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
        );
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "14";
+const DEFAULT_PG_VERSION: &str = "15";

 fn default_conf() -> String {
    format!(
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -24,7 +24,7 @@ use utils::{

 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 //
 // This data structures represents neon_local CLI config
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -369,7 +369,16 @@ impl PageServerNode {
            evictions_low_residence_duration_metric_threshold: settings
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
+            gc_feedback: settings
+                .remove("gc_feedback")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_feedback' as bool")?,
        };
+
+        // If tenant ID was not specified, generate one
+        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
+
        let request = models::TenantCreateRequest {
            new_tenant_id,
            config,
@@ -459,6 +468,11 @@ impl PageServerNode {
                evictions_low_residence_duration_metric_threshold: settings
                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
+                gc_feedback: settings
+                    .remove("gc_feedback")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'gc_feedback' as bool")?,
            }
        };

@@ -495,6 +509,9 @@ impl PageServerNode {
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
    ) -> anyhow::Result<TimelineInfo> {
+        // If timeline ID was not specified, generate one
+        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
+
        self.http_request(
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 set -eux

+# Generate a random tenant or timeline ID
+#
+# Takes a variable name as argument. The result is stored in that variable.
+generate_id() {
+    local -n resvar=$1
+    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+}
+
 PG_VERSION=${PG_VERSION:-14}

 SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -13,29 +21,29 @@ done
 echo "Page server is ready."

 echo "Create a tenant and timeline"
+generate_id tenant_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{}"
+     -d "{\"new_tenant_id\": \"${tenant_id}\"}"
     http://pageserver:9898/v1/tenant/
 )
-tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
+result=$(curl "${PARAMS[@]}")
+echo $result | jq .

+generate_id timeline_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
+     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
     "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
 )
 result=$(curl "${PARAMS[@]}")
 echo $result | jq .

 echo "Overwrite tenant id and timeline id in spec file"
-tenant_id=$(echo ${result} | jq -r .tenant_id)
-timeline_id=$(echo ${result} | jq -r .timeline_id)
-
 sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
 sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations
 by spawning a separate task. The code that handles incoming HTTP
 requests, for example, spawns a separate task for each request,
 because Hyper will drop the request-handling Future if the HTTP
-connection is lost.  (FIXME: our HTTP handlers do not do that
-currently, but we should fix that. See [issue
-3478](https://github.com/neondatabase/neon/issues/3478)).
+connection is lost.


 #### How to cancel, then?
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,29 @@ use crate::reltag::RelTag;
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

-/// A state of a tenant in pageserver's memory.
+/// The state of a tenant in this pageserver.
+///
+/// ```mermaid
+/// stateDiagram-v2
+///
+///     [*] --> Loading: spawn_load()
+///     [*] --> Attaching: spawn_attach()
+///
+///     Loading --> Activating: activate()
+///     Attaching --> Activating: activate()
+///     Activating --> Active: infallible
+///
+///     Loading --> Broken: load() failure
+///     Attaching --> Broken: attach() failure
+///
+///     Active --> Stopping: set_stopping(), part of shutdown & detach
+///     Stopping --> Broken: late error in remove_tenant_from_memory
+///
+///     Broken --> [*]: ignore / detach / shutdown
+///     Stopping --> [*]: remove_from_memory complete
+///
+///     Active --> Broken: cfg(testing)-only tenant break point
+/// ```
 #[derive(
    Clone,
    PartialEq,
@@ -26,40 +48,63 @@ use bytes::{BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::EnumString,
    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
 #[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    /// This tenant is being loaded from local disk
+    /// This tenant is being loaded from local disk.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Loading,
-    /// This tenant is being downloaded from cloud storage.
+    /// This tenant is being attached to the pageserver.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
-    /// Tenant is fully operational
+    /// The tenant is transitioning from Loading/Attaching to Active.
+    ///
+    /// While in this state, the individual timelines are being activated.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
+    Activating(ActivatingFrom),
+    /// The tenant has finished activating and is open for business.
+    ///
+    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
-    /// A tenant is recognized by pageserver, but it is being detached or the
+    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
+    ///
+    /// Transitions out of this state are possible through `set_broken()`.
    Stopping,
-    /// A tenant is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The tenant is recognized by the pageserver, but can no longer be used for
+    /// any operations.
+    ///
+    /// If the tenant fails to load or attach, it will transition to this state
+    /// and it is guaranteed that no background tasks are running in its name.
+    ///
+    /// The other way to transition into this state is from `Stopping` state
+    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
+    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    Broken { reason: String, backtrace: String },
 }

 impl TenantState {
    pub fn attachment_status(&self) -> TenantAttachmentStatus {
        use TenantAttachmentStatus::*;
+
+        // Below, TenantState::Activating is treated as a "transient" or "transparent" state:
+        // it reports the same attachment status as the state it was entered from.
        match self {
            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
            // So, technically, we can return Attached here.
            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
            // But, our attach task might still be fetching the remote timelines, etc.
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
-            Self::Attaching => Maybe,
+            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
-            Self::Loading => Attached,
+            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
            Self::Active => Attached,
@@ -98,6 +143,15 @@ impl std::fmt::Debug for TenantState {
    }
 }

+/// The only [`TenantState`] variants from which a tenant can enter [`TenantState::Activating`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum ActivatingFrom {
+    /// Arrived at [`TenantState::Activating`] from [`TenantState::Loading`]
+    Loading,
+    /// Arrived at [`TenantState::Activating`] from [`TenantState::Attaching`]
+    Attaching,
+}
+
 /// A state of a timeline in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
@@ -118,9 +172,8 @@ pub enum TimelineState {
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_timeline_id: TimelineId,
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
@@ -131,12 +184,11 @@ pub struct TimelineCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize, Debug, Default)]
+#[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_tenant_id: Option<TenantId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -171,6 +223,7 @@ pub struct TenantConfig {
    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
+    pub gc_feedback: Option<bool>,
 }

 #[serde_as]
@@ -184,10 +237,10 @@ pub struct StatusResponse {
 }

 impl TenantCreateRequest {
-    pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
+    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
        TenantCreateRequest {
            new_tenant_id,
-            ..Default::default()
+            config: TenantConfig::default(),
        }
    }
 }
@@ -229,6 +282,7 @@ impl TenantConfigRequest {
            eviction_policy: None,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: None,
+            gc_feedback: None,
        };
        TenantConfigRequest { tenant_id, config }
    }
@@ -831,4 +885,55 @@ mod tests {
            err
        );
    }
+
+    #[test]
+    fn tenantstatus_activating_serde() {
+        let states = [
+            TenantState::Activating(ActivatingFrom::Loading),
+            TenantState::Activating(ActivatingFrom::Attaching),
+        ];
+        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
+
+        let actual = serde_json::to_string(&states).unwrap();
+
+        assert_eq!(actual, expected);
+
+        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
+
+        assert_eq!(states.as_slice(), &parsed);
+    }
+
+    #[test]
+    fn tenantstatus_activating_strum() {
+        // tests added, because we use these for metrics
+        let examples = [
+            (line!(), TenantState::Loading, "Loading"),
+            (line!(), TenantState::Attaching, "Attaching"),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Loading),
+                "Activating",
+            ),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Attaching),
+                "Activating",
+            ),
+            (line!(), TenantState::Active, "Active"),
+            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Broken {
+                    reason: "Example".into(),
+                    backtrace: "Looooong backtrace".into(),
+                },
+                "Broken",
+            ),
+        ];
+
+        for (line, rendered, expected) in examples {
+            let actual: &'static str = rendered.into();
+            assert_eq!(actual, expected, "example on {line}");
+        }
+    }
 }
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -0,0 +1,33 @@
+use std::sync::Arc;
+
+use tokio::sync::{mpsc, Mutex};
+
+/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
+///
+/// Can be cloned, moved and kept around in futures as "guard objects".
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);
+
+/// Barrier will wait until all clones of [`Completion`] have been dropped.
+#[derive(Clone)]
+pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+
+impl Barrier {
+    pub async fn wait(self) {
+        self.0.lock().await.recv().await;
+    }
+
+    pub async fn maybe_wait(barrier: Option<Barrier>) {
+        if let Some(b) = barrier {
+            b.wait().await
+        }
+    }
+}
+
+/// Create a new [`Completion`] and [`Barrier`] pair.
+pub fn channel() -> (Completion, Barrier) {
+    let (tx, rx) = mpsc::channel::<()>(1);
+    let rx = Mutex::new(rx);
+    let rx = Arc::new(rx);
+    (Completion(tx), Barrier(rx))
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,5 +1,5 @@
 use crate::auth::{Claims, JwtAuth};
-use crate::http::error;
+use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
@@ -16,8 +16,6 @@ use std::future::Future;
 use std::net::TcpListener;
 use std::str::FromStr;

-use super::error::ApiError;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -35,8 +33,18 @@ struct RequestId(String);
 /// Adds a tracing info_span! instrumentation around the handler events,
 /// logs the request start and end events for non-GET requests and non-200 responses.
 ///
+/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
+///
 /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
-/// in this type will get request info logged in the wrapping span, including the unique request ID.
+/// with this will get request info logged in the wrapping span, including the unique request ID.
+///
+/// This also handles errors, logging them and converting them to an HTTP error response.
+///
+/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
+/// completion. In other words, the handler must be async cancellation safe! request_span
+/// prints a warning to the log when that happens, so that you have some trace of it in
+/// the log.
+///
 ///
 /// There could be other ways to implement similar functionality:
 ///
@@ -54,60 +62,56 @@ struct RequestId(String);
 /// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
-pub struct RequestSpan<E, R, H>(pub H)
+pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
-
-impl<E, R, H> RequestSpan<E, R, H>
-where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
-    /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
-    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
-    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
-        let request_id = request.context::<RequestId>().unwrap_or_default().0;
-        let method = request.method();
-        let path = request.uri().path();
-        let request_span = info_span!("request", %method, %path, %request_id);
+    let request_id = request.context::<RequestId>().unwrap_or_default().0;
+    let method = request.method();
+    let path = request.uri().path();
+    let request_span = info_span!("request", %method, %path, %request_id);

-        let log_quietly = method == Method::GET;
-        async move {
-            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
-            if log_quietly {
-                debug!("Handling request");
-            } else {
-                info!("Handling request");
-            }
-
-            // Note that we reuse `error::handler` here and not returning and error at all,
-            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
-            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
-            //
-            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            let res = (self.0)(request).await;
-
-            cancellation_guard.disarm();
-
-            match res {
-                Ok(response) => {
-                    let response_status = response.status();
-                    if log_quietly && response_status.is_success() {
-                        debug!("Request handled, status: {response_status}");
-                    } else {
-                        info!("Request handled, status: {response_status}");
-                    }
-                    Ok(response)
-                }
-                Err(e) => Ok(error::handler(e.into()).await),
-            }
+    let log_quietly = method == Method::GET;
+    async move {
+        let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
+        if log_quietly {
+            debug!("Handling request");
+        } else {
+            info!("Handling request");
+        }
+
+        // No special handling for panics here. There's a `tracing_panic_hook` from another
+        // module to do that globally.
+        let res = handler(request).await;
+
+        cancellation_guard.disarm();
+
+        // Log the result if needed.
+        //
+        // We also convert any errors into an Ok response with HTTP error code here.
+        // `make_router` sets a last-resort error handler that would do the same, but
+        // we prefer to do it here, before we exit the request span, so that the error
+        // is still logged with the span.
+        //
+        // (Because we convert errors to Ok response, we never actually return an error,
+        // and we could declare the function to return the never type (`!`). However,
+        // using `routerify::RouterBuilder` requires a proper error type.)
+        match res {
+            Ok(response) => {
+                let response_status = response.status();
+                if log_quietly && response_status.is_success() {
+                    debug!("Request handled, status: {response_status}");
+                } else {
+                    info!("Request handled, status: {response_status}");
+                }
+                Ok(response)
+            }
+            Err(err) => Ok(api_error_handler(err)),
        }
-        .instrument(request_span)
-        .await
    }
+    .instrument(request_span)
+    .await
 }

 /// Drop guard to WARN in case the request was dropped before completion.
@@ -207,10 +211,8 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
-        .get("/metrics", |r| {
-            RequestSpan(prometheus_metrics_handler).handle(r)
-        })
-        .err_handler(error::handler)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .err_handler(route_error_handler)
 }

 pub fn attach_openapi_ui(
@@ -220,12 +222,14 @@ pub fn attach_openapi_ui(
    ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    router_builder
-        .get(spec_mount_path, move |r| {
-            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
-                .handle(r)
-        })
-        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
-            Ok(Response::builder().body(Body::from(format!(r#"
+        .get(spec_mount_path,
+            move |r| request_span(r, move |_| async move {
+                Ok(Response::builder().body(Body::from(spec)).unwrap())
+            })
+        )
+        .get(ui_mount_path,
+             move |r| request_span(r, move |_| async move {
+                 Ok(Response::builder().body(Body::from(format!(r#"
                <!DOCTYPE html>
                <html lang="en">
                <head>
@@ -255,7 +259,8 @@ pub fn attach_openapi_ui(
                </body>
                </html>
            "#, spec_mount_path))).unwrap())
-        }).handle(r))
+             })
+        )
 }

 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -83,13 +83,24 @@ impl HttpErrorBody {
    }
 }

-pub async fn handler(err: routerify::RouteError) -> Response<Body> {
-    let api_error = err
-        .downcast::<ApiError>()
-        .expect("handler should always return api error");
+pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
+    match err.downcast::<ApiError>() {
+        Ok(api_error) => api_error_handler(*api_error),
+        Err(other_error) => {
+            // We expect all the request handlers to return an ApiError, so this should
+            // not be reached. But just in case.
+            error!("Error processing HTTP request: {other_error:?}");
+            HttpErrorBody::response_from_msg_and_status(
+                other_error.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            )
+        }
+    }
+}

+pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error.as_ref() {
+    if let ApiError::InternalServerError(_) = api_error {
        error!("Error processing HTTP request: {api_error:?}");
    } else {
        error!("Error processing HTTP request: {api_error:#}");
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,6 +60,9 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

+/// A simple once-barrier and a guard that keeps the barrier waiting.
+pub mod completion;
+
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -16,3 +16,4 @@ postgres_ffi.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+itertools.workspace = true
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -41,9 +41,18 @@ pub(crate) enum LayerCmd {
        /// The id from list-layer command
        id: usize,
    },
+    /// Output layer statistics
+    GetStats {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+        /// The id from list-layer command
+        id: usize,
+    },
 }

-fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+// Return (key, value.len) for all keys, sorted by key.
+fn read_delta_file(path: impl AsRef<Path>) -> Result<Vec<(Key, usize)>> {
    use pageserver::tenant::blob_io::BlobCursor;
    use pageserver::tenant::block_io::BlockReader;

@@ -70,11 +79,48 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
        },
    )?;
    let mut cursor = BlockCursor::new(&file);
+
+    let mut result = vec![];
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
-        println!("key:{} value_len:{}", k, value.len());
+        result.push((k, value.len()));
    }
    // TODO(chi): special handling for last key?
+    Ok(result)
+}
+
+// We divide the entire i128 keyspace into pre-assigned fixed segments,
+// 8MB each. Group keys by segment, and report segment size for each.
+//
+// 8MB is chosen as the segment size because we're unlikely to make
+// s3 partial downloads smaller than 8MB (due to cost). So summarizing
+// layer metadata in 8MB segments could be enough to generate good test
+// data for write amplification tests.
+//
+// Note that the segments are fixed, and don't depend on what keyspace
+// is actually in use.
+fn read_delta_segments(path: impl AsRef<Path>) -> Result<Vec<(i128, usize)>> {
+    use itertools::Itertools;
+
+    // A page is 8KB, so dropping the low 10 bits buckets 1024 pages (8MB) together.
+    let segment_of = |key: &Key| key.to_i128() >> 10;
+
+    let entries = read_delta_file(path)?;
+    let by_segment = entries.iter().group_by(|(key, _)| segment_of(key));
+    let mut totals = Vec::new();
+    for (segment, members) in &by_segment {
+        let bytes: usize = members.map(|(_, len)| len).sum();
+        totals.push((segment, bytes));
+    }
+    Ok(totals)
+}
+
+fn summarize_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    // Prints one plain-text line per segment.
+    // TODO write in some compressed binary format
+    let segments = read_delta_segments(path)?;
+    let report = |(segment, size): (i128, usize)| println!("segment:{} size:{}", segment, size);
+    segments.into_iter().for_each(report);
    Ok(())
 }

@@ -153,7 +199,38 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path())?;
+                            for (k, len) in read_delta_file(layer.path())? {
+                                println!("key:{} value_len:{}", k, len);
+                            }
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::GetStats {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        if layer_file.is_delta {
+                            summarize_delta_file(layer.path())?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
 use tracing::*;

@@ -18,9 +19,7 @@ use pageserver::{
    context::{DownloadBehavior, RequestContext},
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
-    task_mgr::{
-        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
-    },
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -276,7 +275,18 @@ fn start_pageserver(
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

    // Launch broker client
-    WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
+    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
+    let broker_client = WALRECEIVER_RUNTIME
+        .block_on(async {
+            // Note: we do not attempt connecting here (but validate endpoints sanity).
+            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+        })
+        .with_context(|| {
+            format!(
+                "create broker client for uri={:?} keepalive_interval={:?}",
+                &conf.broker_endpoint, conf.broker_keepalive_interval,
+            )
+        })?;

    // Initialize authentication for incoming connections
    let http_auth;
@@ -325,8 +335,33 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // All initial tenant load operations hold on to this while they are ongoing and drop it
+    // once they finish, whether successfully or not. When every holder is gone, the initial
+    // load is done and we can start the global background tasks.
+    let (init_done_tx, init_done_rx) = utils::completion::channel();
+
    // Scan the local 'tenants/' directory and start loading the tenants
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
+    let init_started_at = std::time::Instant::now();
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+        conf,
+        broker_client.clone(),
+        remote_storage.clone(),
+        (init_done_tx, init_done_rx.clone()),
+    ))?;
+
+    BACKGROUND_RUNTIME.spawn({
+        let init_done_rx = init_done_rx.clone();
+        async move {
+            init_done_rx.wait().await;
+
+            let elapsed = init_started_at.elapsed();
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed."
+            );
+        }
+    });

    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -339,6 +374,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            init_done_rx.clone(),
        )?;
    }

@@ -351,6 +387,7 @@ fn start_pageserver(
            conf,
            launch_ts,
            http_auth,
+            broker_client.clone(),
            remote_storage,
            disk_usage_eviction_state,
        )?
@@ -375,6 +412,7 @@ fn start_pageserver(
        );

        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            let init_done_rx = init_done_rx;
            let metrics_ctx = RequestContext::todo_child(
                TaskKind::MetricsCollection,
                // This task itself shouldn't download anything.
@@ -390,6 +428,13 @@ fn start_pageserver(
                "consumption metrics collection",
                true,
                async move {
+                    // first wait for initial load to complete before first iteration.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    init_done_rx.wait().await;
+
                    pageserver::consumption_metrics::collect_metrics(
                        metric_collection_endpoint,
                        conf.metric_collection_interval,
@@ -427,6 +472,7 @@ fn start_pageserver(
            async move {
                page_service::libpq_listener_main(
                    conf,
+                    broker_client,
                    pg_auth,
                    pageserver_listener,
                    conf.pg_auth_type,
--- a/pageserver/src/broker_client.rs
+++ b/pageserver/src/broker_client.rs
@@ -1,48 +0,0 @@
-//! The broker client instance of the pageserver, created during pageserver startup.
-//! Used by each timelines' [`walreceiver`].
-
-use crate::config::PageServerConf;
-
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-use storage_broker::BrokerClientChannel;
-use tracing::*;
-
-static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
-
-///
-/// Initialize the broker client. This must be called once at page server startup.
-///
-pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
-    let broker_endpoint = conf.broker_endpoint.clone();
-
-    // Note: we do not attempt connecting here (but validate endpoints sanity).
-    let broker_client =
-        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
-            format!(
-                "Failed to create broker client to {}",
-                &conf.broker_endpoint
-            ),
-        )?;
-
-    if BROKER_CLIENT.set(broker_client).is_err() {
-        panic!("broker already initialized");
-    }
-
-    info!(
-        "Initialized broker client with endpoints: {}",
-        broker_endpoint
-    );
-    Ok(())
-}
-
-///
-/// Get a handle to the broker client
-///
-pub fn get_broker_client() -> &'static BrokerClientChannel {
-    BROKER_CLIENT.get().expect("broker client not initialized")
-}
-
-pub fn is_broker_client_initialized() -> bool {
-    BROKER_CLIENT.get().is_some()
-}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -108,7 +108,7 @@ pub mod defaults {

 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-
+#gc_feedback = false
 # [remote_storage]

 "###
@@ -828,6 +828,14 @@ impl PageServerConf {
            )?);
        }

+        if let Some(gc_feedback) = item.get("gc_feedback") {
+            t_conf.gc_feedback = Some(
+                gc_feedback
+                    .as_bool()
+                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
+            );
+        }
+
        Ok(t_conf)
    }

--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,6 +88,7 @@
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
+#[derive(Clone, Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
@@ -95,7 +96,7 @@ pub struct RequestContext {

 /// Desired behavior if the operation requires an on-demand download
 /// to proceed.
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DownloadBehavior {
    /// Download the layer file. It can take a while.
    Download,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;

 use crate::{
@@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    init_done: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -98,6 +100,9 @@ pub fn launch_disk_usage_global_eviction_task(
        "disk usage based eviction",
        false,
        async move {
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            init_done.wait().await;
+
            disk_usage_eviction_task(
                &state,
                task_config,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -678,6 +678,8 @@ paths:
          application/json:
            schema:
              type: object
+              required:
+                - new_timeline_id
              properties:
                new_timeline_id:
                  type: string
@@ -936,6 +938,8 @@ components:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
        - type: object
+          required:
+            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,3 +1,6 @@
+//!
+//! Management HTTP API
+//!
 use std::collections::HashMap;
 use std::sync::Arc;

@@ -7,10 +10,11 @@ use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::http::endpoint::RequestSpan;
+use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

@@ -24,7 +28,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
+use crate::tenant::mgr::{
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
@@ -43,7 +49,6 @@ use utils::{
 };

 // Imports only used for testing APIs
-#[cfg(feature = "testing")]
 use super::models::ConfigureFailpointsRequest;

 struct State {
@@ -51,6 +56,7 @@ struct State {
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
+    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 }

@@ -59,6 +65,7 @@ impl State {
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
+        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
@@ -70,6 +77,7 @@ impl State {
            auth,
            allowlist_routes,
            remote_storage,
+            broker_client,
            disk_usage_eviction_state,
        })
    }
@@ -140,6 +148,36 @@ impl From<TenantStateError> for ApiError {
    }
 }

+impl From<GetTenantError> for ApiError {
+    /// Unknown tenant becomes `ApiError::NotFound`; an inactive one `InternalServerError`.
+    fn from(err: GetTenantError) -> Self {
+        use GetTenantError::{NotActive, NotFound};
+        // NotActive deliberately does NOT become `ApiError::NotFound`: a 404 for a tenant
+        // that does in fact exist locally could lead the caller to conclude that it can
+        // attach the tenant to another PS, and we'd be in split-brain.
+        // (Only `mgr::get_tenant(..., active=true)` calls can produce NotActive.)
+        match err {
+            NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            not_active @ NotActive(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(not_active))
+            }
+        }
+    }
+}
+
+impl From<SetNewTenantConfigError> for ApiError {
+    /// `GetTenant` failures surface as not-found, `Persist` failures as internal errors.
+    fn from(err: SetNewTenantConfigError) -> Self {
+        use SetNewTenantConfigError::{GetTenant, Persist};
+        // NOTE(review): if `GetTenant` wraps a `GetTenantError`, an inactive tenant would
+        // also be reported as not found here — confirm against the enum definition.
+        match err {
+            GetTenant(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            persist @ Persist(_) => ApiError::InternalServerError(anyhow::Error::new(persist)),
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -159,7 +197,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(TenantStateError::NotFound(..)) => {
+            Tenant(GetTenantError::NotFound(..)) => {
                ApiError::PreconditionFailed("Requested tenant is missing")
            }
            Tenant(t) => ApiError::from(t),
@@ -254,23 +292,29 @@ fn build_timeline_info_common(
 }

 // healthcheck handler
-async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn status_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    let config = get_config(&request);
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
    check_permission(&request, Some(tenant_id))?;

-    let new_timeline_id = request_data
-        .new_timeline_id
-        .unwrap_or_else(TimelineId::generate);
+    let new_timeline_id = request_data.new_timeline_id;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

+    let state = get_state(&request);
+
    async {
        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
@@ -278,6 +322,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            request_data.ancestor_timeline_id.map(TimelineId::from),
            request_data.ancestor_start_lsn,
            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            state.broker_client.clone(),
            &ctx,
        )
        .await {
@@ -291,11 +336,14 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

-async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
@@ -329,7 +377,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, response_data)
 }

-async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_detail_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
@@ -363,7 +414,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, timeline_info)
 }

-async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_lsn_by_timestamp_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -387,7 +441,10 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
    json_response(StatusCode::OK, result)
 }

-async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_attach_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -408,6 +465,7 @@ async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Bo
            state.conf,
            tenant_id,
            tenant_conf,
+            state.broker_client.clone(),
            remote_storage.clone(),
            &ctx,
        )
@@ -422,7 +480,10 @@ async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Bo
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -436,7 +497,10 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, ())
 }

-async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_detach_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
@@ -450,21 +514,33 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_load_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let state = get_state(&request);
-    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
-        .instrument(info_span!("load", tenant = %tenant_id))
-        .await?;
+    mgr::load_tenant(
+        state.conf,
+        tenant_id,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        &ctx,
+    )
+    .instrument(info_span!("load", tenant = %tenant_id))
+    .await?;

    json_response(StatusCode::ACCEPTED, ())
 }

-async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_ignore_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -477,7 +553,10 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

    let response_data = mgr::list_tenants()
@@ -497,7 +576,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::OK, response_data)
 }

-async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_status(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -511,7 +593,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        }

        let state = tenant.current_state();
-        Ok(TenantInfo {
+        Result::<_, ApiError>::Ok(TenantInfo {
            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
@@ -519,8 +601,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        })
    }
    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
-    .await
-    .map_err(ApiError::InternalServerError)?;
+    .await?;

    json_response(StatusCode::OK, tenant_info)
 }
@@ -538,7 +619,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// Note: we don't update the cached size and prometheus metric here.
 /// The retention period might be different, and it's nice to have a method to just calculate it
 /// without modifying anything anyway.
-async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
@@ -603,7 +687,10 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
    )
 }

-async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_map_info_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
@@ -617,7 +704,10 @@ async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>
    json_response(StatusCode::OK, layer_map_info)
 }

-async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -640,7 +730,10 @@ async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>
    }
 }

-async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn evict_timeline_layer_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -718,7 +811,12 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

-async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let target_tenant_id = request_data.new_tenant_id;
    check_permission(&request, None)?;

    let _timer = STORAGE_TIME_GLOBAL
@@ -726,17 +824,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        .expect("bug")
        .start_timer();

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let request_data: TenantCreateRequest = json_request(&mut request).await?;
-
    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    let target_tenant_id = request_data
-        .new_tenant_id
-        .map(TenantId::from)
-        .unwrap_or_else(TenantId::generate);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let state = get_state(&request);

@@ -744,6 +835,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        state.conf,
        tenant_conf,
        target_tenant_id,
+        state.broker_client.clone(),
        state.remote_storage.clone(),
        &ctx,
    )
@@ -769,7 +861,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    )
 }

-async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_tenant_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -795,6 +890,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo

 async fn update_tenant_config_handler(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let request_data: TenantConfigRequest = json_request(&mut request).await?;
    let tenant_id = request_data.tenant_id;
@@ -812,21 +908,25 @@ async fn update_tenant_config_handler(
 }

 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
-#[cfg(feature = "testing")]
-async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_break(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

-    tenant.set_broken("broken from test".to_owned());
+    tenant.set_broken("broken from test".to_owned()).await;

    json_response(StatusCode::OK, ())
 }

-#[cfg(feature = "testing")]
-async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow!(
            "Cannot manage failpoints because pageserver was compiled without failpoints support"
@@ -859,7 +959,10 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }

 // Run GC immediately on given timeline.
-async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_gc_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -878,8 +981,10 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 }

 // Run compaction immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -900,8 +1005,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
 }

 // Run checkpoint immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_checkpoint_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -925,6 +1032,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<

 async fn timeline_download_remote_layers_handler_post(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -940,6 +1048,7 @@ async fn timeline_download_remote_layers_handler_post(

 async fn timeline_download_remote_layers_handler_get(
    request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -963,7 +1072,10 @@ async fn active_timeline_of_active_tenant(
        .map_err(ApiError::NotFound)
 }

-async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn always_panic_handler(
+    req: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
    // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
    // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
@@ -974,7 +1086,10 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
    json_response(StatusCode::NO_CONTENT, ())
 }

-async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn disk_usage_eviction_run(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;

    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
@@ -1064,8 +1179,10 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    )
 }

-#[cfg(feature = "testing")]
-async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn post_tracing_event_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    #[derive(Debug, serde::Deserialize)]
    #[serde(rename_all = "lowercase")]
    enum Level {
@@ -1095,10 +1212,90 @@ async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Bod
    json_response(StatusCode::OK, ())
 }

+/// Common functionality of all the HTTP API handlers.
+///
+/// - Adds a tracing span to each request (by `request_span`)
+/// - Logs the request depending on the request method (by `request_span`)
+/// - Logs the response if it was not successful (by `request_span`)
+/// - Shields the handler function from async cancellations. Hyper can drop the handler
+///   Future if the connection to the client is lost, but most of the pageserver code is
+///   not async cancellation safe. This converts the dropped future into a graceful cancellation
+///   request with a CancellationToken.
+async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    // Spawn a new task to handle the request, to protect the handler from unexpected
+    // async cancellations. Most pageserver functions are not async cancellation safe.
+    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
+    // with the cancellation token.
+    let token = CancellationToken::new();
+    let cancel_guard = token.clone().drop_guard();
+    let result = request_span(request, move |r| async {
+        let handle = tokio::spawn(
+            async {
+                let token_cloned = token.clone();
+                let result = handler(r, token).await;
+                if token_cloned.is_cancelled() {
+                    info!("Cancelled request finished");
+                }
+                result
+            }
+            .in_current_span(),
+        );
+
+        match handle.await {
+            Ok(result) => result,
+            Err(e) => {
+                // The handler task panicked. We have a global panic handler that logs the
+                // panic with its backtrace, so no need to log that here. Only log a brief
+                // message to make it clear that we returned the error to the client.
+                error!("HTTP request handler task panicked: {e:#}");
+
+                // Don't return an Error here, because then the fallback error handler that was
+                // installed in make_router() will print the error. Instead, construct the
+                // HTTP error response and return that.
+                Ok(
+                    ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
+                        .into_response(),
+                )
+            }
+        }
+    })
+    .await;
+
+    cancel_guard.disarm();
+
+    result
+}
+
+/// Like api_handler, but returns an error response if the server is built without
+/// the 'testing' feature.
+async fn testing_api_handler<R, H>(
+    desc: &str,
+    request: Request<Body>,
+    handler: H,
+) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    if cfg!(feature = "testing") {
+        api_handler(request, handler).await
+    } else {
+        std::future::ready(Err(ApiError::BadRequest(anyhow!(
+            "Cannot {desc} because pageserver was compiled without testing APIs",
+        ))))
+        .await
+    }
+}
+
 pub fn make_router(
    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
+    broker_client: BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
@@ -1123,121 +1320,99 @@ pub fn make_router(
        .expect("construct launch timestamp header middleware"),
    );

-    macro_rules! testing_api {
-        ($handler_desc:literal, $handler:path $(,)?) => {{
-            #[cfg(not(feature = "testing"))]
-            async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-                Err(ApiError::BadRequest(anyhow!(concat!(
-                    "Cannot ",
-                    $handler_desc,
-                    " because pageserver was compiled without testing APIs",
-                ))))
-            }
-
-            #[cfg(feature = "testing")]
-            let handler = $handler;
-            #[cfg(not(feature = "testing"))]
-            let handler = cfg_disabled;
-
-            move |r| RequestSpan(handler).handle(r)
-        }};
-    }
-
    Ok(router
        .data(Arc::new(
-            State::new(conf, auth, remote_storage, disk_usage_eviction_state)
-                .context("Failed to initialize router state")?,
+            State::new(
+                conf,
+                auth,
+                remote_storage,
+                broker_client,
+                disk_usage_eviction_state,
+            )
+            .context("Failed to initialize router state")?,
        ))
-        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
-        .put(
-            "/v1/failpoints",
-            testing_api!("manage failpoints", failpoints_handler),
-        )
-        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
-        .post("/v1/tenant", |r| {
-            RequestSpan(tenant_create_handler).handle(r)
-        })
-        .get("/v1/tenant/:tenant_id", |r| {
-            RequestSpan(tenant_status).handle(r)
+        .get("/v1/status", |r| api_handler(r, status_handler))
+        .put("/v1/failpoints", |r| {
+            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
+        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
+        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
-            RequestSpan(tenant_size_handler).handle(r)
+            api_handler(r, tenant_size_handler)
        })
        .put("/v1/tenant/config", |r| {
-            RequestSpan(update_tenant_config_handler).handle(r)
+            api_handler(r, update_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            RequestSpan(get_tenant_config_handler).handle(r)
+            api_handler(r, get_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_list_handler).handle(r)
+            api_handler(r, timeline_list_handler)
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_create_handler).handle(r)
+            api_handler(r, timeline_create_handler)
        })
        .post("/v1/tenant/:tenant_id/attach", |r| {
-            RequestSpan(tenant_attach_handler).handle(r)
+            api_handler(r, tenant_attach_handler)
        })
        .post("/v1/tenant/:tenant_id/detach", |r| {
-            RequestSpan(tenant_detach_handler).handle(r)
+            api_handler(r, tenant_detach_handler)
        })
        .post("/v1/tenant/:tenant_id/load", |r| {
-            RequestSpan(tenant_load_handler).handle(r)
+            api_handler(r, tenant_load_handler)
        })
        .post("/v1/tenant/:tenant_id/ignore", |r| {
-            RequestSpan(tenant_ignore_handler).handle(r)
+            api_handler(r, tenant_ignore_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_detail_handler).handle(r)
+            api_handler(r, timeline_detail_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
+            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            RequestSpan(timeline_gc_handler).handle(r)
+            api_handler(r, timeline_gc_handler)
+        })
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
+            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
-            testing_api!("run timeline compaction", timeline_compact_handler),
-        )
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
-            testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
+            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
        )
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_post),
        )
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_delete_handler).handle(r)
+            api_handler(r, timeline_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            RequestSpan(layer_map_info_handler).handle(r)
+            api_handler(r, layer_map_info_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(layer_download_handler).handle(r),
+            |r| api_handler(r, layer_download_handler),
        )
        .delete(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
+            |r| api_handler(r, evict_timeline_layer_handler),
        )
        .put("/v1/disk_usage_eviction/run", |r| {
-            RequestSpan(disk_usage_eviction_run).handle(r)
+            api_handler(r, disk_usage_eviction_run)
+        })
+        .put("/v1/tenant/:tenant_id/break", |r| {
+            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
+        })
+        .get("/v1/panic", |r| api_handler(r, always_panic_handler))
+        .post("/v1/tracing/event", |r| {
+            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/break",
-            testing_api!("set tenant state to broken", handle_tenant_break),
-        )
-        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
-        .post(
-            "/v1/tracing/event",
-            testing_api!("emit a tracing event", post_tracing_event_handler),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,6 +1,5 @@
 mod auth;
 pub mod basebackup;
-pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
@@ -36,7 +35,7 @@ use tracing::info;
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -46,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

+#[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_read_num_fs_layers",
+        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        &["tenant_id", "timeline_id"],
+        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics collected on operations on the storage repository.
 static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
@@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_materialized_cache_hits_direct_total",
+        "Number of cache hits from materialized page cache without redo",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_getpage_get_reconstruct_data_seconds",
+        "Time spent in get_reconstruct_value_data",
+        &["tenant_id", "timeline_id"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_materialized_cache_hits_total",
@@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    0.001000, // 1000 usec
    0.030,    // 30 ms
    1.000,    // 1000 ms
+    30.000,   // 30000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the WAL redo process",
+        "Time spent waiting for access to the Postgres WAL redo process",
        redo_histogram_time_buckets!(),
    )
    .expect("failed to define a metric")
@@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
        redo_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of bytes replayed per redo sent to Postgres",
        redo_bytes_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -723,7 +753,9 @@ pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
    pub reconstruct_time_histo: Histogram,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
+    pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -734,6 +766,7 @@ pub struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -753,6 +786,9 @@ impl TimelineMetrics {
        let reconstruct_time_histo = RECONSTRUCT_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -794,6 +830,12 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

@@ -801,7 +843,9 @@ impl TimelineMetrics {
            tenant_id,
            timeline_id,
            reconstruct_time_histo,
+            get_reconstruct_data_time_histo,
            materialized_page_cache_hit_counter,
+            materialized_page_cache_hit_upon_request_counter,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -819,6 +863,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -828,7 +873,9 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -836,6 +883,8 @@ impl Drop for TimelineMetrics {
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
+
        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::mgr;
+use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

@@ -172,6 +174,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 ///
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -213,7 +216,14 @@ pub async fn libpq_listener_main(
                    None,
                    "serving compute connection task",
                    false,
-                    page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
+                    page_service_conn_main(
+                        conf,
+                        broker_client.clone(),
+                        local_auth,
+                        socket,
+                        auth_type,
+                        connection_ctx,
+                    ),
                );
            }
            Err(err) => {
@@ -230,6 +240,7 @@ pub async fn libpq_listener_main(

 async fn page_service_conn_main(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
@@ -266,7 +277,7 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -324,6 +335,7 @@ impl PageRequestMetrics {

 struct PageServerHandler {
    _conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    claims: Option<Claims>,

@@ -337,11 +349,13 @@ struct PageServerHandler {
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
+        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
+            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -494,7 +508,12 @@ impl PageServerHandler {

        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        timeline
-            .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
+            .import_basebackup_from_tar(
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
            .await?;

        // Read the end of the tar archive.
@@ -1131,7 +1150,9 @@ enum GetActiveTenantError {
        wait_time: Duration,
    },
    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
 }

 impl From<GetActiveTenantError> for QueryError {
@@ -1140,7 +1161,8 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::Other(e) => QueryError::Other(e),
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
        }
    }
 }
@@ -1156,13 +1178,16 @@ async fn get_active_tenant_with_timeout(
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
    let tenant = match mgr::get_tenant(tenant_id, false).await {
        Ok(tenant) => tenant,
-        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active=false")
+        }
    };
    let wait_time = Duration::from_secs(30);
    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
        Ok(Ok(())) => Ok(tenant),
        // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
        Err(_) => {
            let latest_state = tenant.current_state();
            if latest_state == TenantState::Active {
@@ -1177,13 +1202,34 @@ async fn get_active_tenant_with_timeout(
    }
 }

+#[derive(Debug, thiserror::Error)]
+enum GetActiveTimelineError {
+    #[error(transparent)]
+    Tenant(GetActiveTenantError),
+    #[error(transparent)]
+    Timeline(anyhow::Error),
+}
+
+impl From<GetActiveTimelineError> for QueryError {
+    fn from(e: GetActiveTimelineError) -> Self {
+        match e {
+            GetActiveTimelineError::Tenant(e) => e.into(),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+        }
+    }
+}
+
 /// Shorthand for getting a reference to a Timeline of an Active tenant.
 async fn get_active_tenant_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTenantError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
-    let timeline = tenant.get_timeline(timeline_id, true)?;
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(GetActiveTimelineError::Timeline)?;
    Ok(timeline)
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -16,9 +16,11 @@ use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tracing::*;
+use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;

 use std::cmp::min;
@@ -238,6 +240,7 @@ impl UninitializedTimeline<'_> {
        self,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
+        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let raw_timeline = self.raw_timeline()?;
@@ -264,7 +267,7 @@ impl UninitializedTimeline<'_> {
        // updated it for the layers that we created during the import.
        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        let tl = self.initialize_with_lock(ctx, &mut timelines, false)?;
-        tl.activate(ctx)?;
+        tl.activate(broker_client, ctx);
        Ok(tl)
    }

@@ -445,11 +448,44 @@ pub enum DeleteTimelineError {
    Other(#[from] anyhow::Error),
 }

+pub enum SetStoppingError {
+    AlreadyStopping,
+    Broken,
+}
+
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum WaitToBecomeActiveError {
+    WillNotBecomeActive {
+        tenant_id: TenantId,
+        state: TenantState,
+    },
+    TenantDropped {
+        tenant_id: TenantId,
+    },
+}
+
+impl std::fmt::Display for WaitToBecomeActiveError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
+                write!(
+                    f,
+                    "Tenant {} will not become active. Current state: {:?}",
+                    tenant_id, state
+                )
+            }
+            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
+                write!(f, "Tenant {tenant_id} will not become active (dropped)")
+            }
+        }
+    }
+}
+
 impl Tenant {
    /// Yet another helper for timeline initialization.
    /// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -585,6 +621,7 @@ impl Tenant {
    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
+        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -614,16 +651,17 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
-                let doit = async {
-                    tenant_clone.attach(&ctx).await?;
-                    tenant_clone.activate(&ctx)?;
-                    anyhow::Ok(())
-                };
-                match doit.await {
-                    Ok(_) => {}
+                match tenant_clone.attach(&ctx).await {
+                    Ok(()) => {
+                        info!("attach finished, activating");
+                        tenant_clone.activate(broker_client, None, &ctx);
+                    }
                    Err(e) => {
-                        tenant_clone.set_broken(e.to_string());
-                        error!("error attaching tenant: {:?}", e);
+                        error!("attach failed, setting tenant state to Broken: {:?}", e);
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(e.to_string());
+                        });
                    }
                }
                Ok(())
@@ -640,6 +678,8 @@ impl Tenant {
    ///
    /// Background task that downloads all data for a tenant and brings it to Active state.
    ///
+    /// No background tasks are started as part of this routine.
+    ///
    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_id();

@@ -850,13 +890,17 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    ///
-    #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))]
+    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
    pub fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
+        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: Option<GenericRemoteStorage>,
+        init_done: Option<(completion::Completion, completion::Barrier)>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
+        debug_assert_current_span_has_tenant_id();
+
        let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
            Ok(conf) => conf,
            Err(e) => {
@@ -888,20 +932,27 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
-                let doit = async {
-                    tenant_clone.load(&ctx).await?;
-                    tenant_clone.activate(&ctx)?;
-                    anyhow::Ok(())
+                // keep the sender alive as long as we have the initial load ongoing; it will be
+                // None for loads spawned after init_tenant_mgr.
+                let (_tx, rx) = if let Some((tx, rx)) = init_done {
+                    (Some(tx), Some(rx))
+                } else {
+                    (None, None)
                };
-                match doit.await {
-                    Ok(()) => {}
+                match tenant_clone.load(&ctx).await {
+                    Ok(()) => {
+                        debug!("load finished, activating");
+                        tenant_clone.activate(broker_client, rx.as_ref(), &ctx);
+                    }
                    Err(err) => {
-                        tenant_clone.set_broken(err.to_string());
-                        error!("could not load tenant {tenant_id}: {err:?}");
+                        error!("load failed, setting tenant state to Broken: {err:?}");
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(err.to_string());
+                        });
                    }
                }
-                info!("initial load for tenant {tenant_id} finished!");
-                Ok(())
+               Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -910,8 +961,6 @@ impl Tenant {
            }),
        );

-        info!("spawned load into background");
-
        tenant
    }

@@ -919,10 +968,11 @@ impl Tenant {
    /// Background task to load in-memory data structures for this tenant, from
    /// files on disk. Used at pageserver startup.
    ///
+    /// No background tasks are started as part of this routine.
    async fn load(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_id();

-        info!("loading tenant task");
+        debug!("loading tenant task");

        utils::failpoint_sleep_millis_async!("before-loading-tenant");

@@ -932,102 +982,109 @@ impl Tenant {
        //
        // Scan the directory, peek into the metadata file of each timeline, and
        // collect a list of timelines and their ancestors.
-        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let timelines_dir = self.conf.timelines_path(&self.tenant_id);
-        for entry in std::fs::read_dir(&timelines_dir).with_context(|| {
-            format!(
-                "Failed to list timelines directory for tenant {}",
-                self.tenant_id
-            )
-        })? {
-            let entry = entry.with_context(|| {
-                format!("cannot read timeline dir entry for {}", self.tenant_id)
-            })?;
-            let timeline_dir = entry.path();
+        let tenant_id = self.tenant_id;
+        let conf = self.conf;
+        let span = info_span!("blocking");

-            if crate::is_temporary(&timeline_dir) {
-                info!(
-                    "Found temporary timeline directory, removing: {}",
-                    timeline_dir.display()
-                );
-                if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
-                    error!(
-                        "Failed to remove temporary directory '{}': {:?}",
-                        timeline_dir.display(),
-                        e
+        let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
+            let _g = span.entered();
+            let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
+            let timelines_dir = conf.timelines_path(&tenant_id);
+
+            for entry in
+                std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
+            {
+                let entry = entry.context("read timeline dir entry")?;
+                let timeline_dir = entry.path();
+
+                if crate::is_temporary(&timeline_dir) {
+                    info!(
+                        "Found temporary timeline directory, removing: {}",
+                        timeline_dir.display()
                    );
-                }
-            } else if is_uninit_mark(&timeline_dir) {
-                let timeline_uninit_mark_file = &timeline_dir;
-                info!(
-                    "Found an uninit mark file {}, removing the timeline and its uninit mark",
-                    timeline_uninit_mark_file.display()
-                );
-                let timeline_id = timeline_uninit_mark_file
-                    .file_stem()
-                    .and_then(OsStr::to_str)
-                    .unwrap_or_default()
-                    .parse::<TimelineId>()
-                    .with_context(|| {
-                        format!(
+                    if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
+                        error!(
+                            "Failed to remove temporary directory '{}': {:?}",
+                            timeline_dir.display(),
+                            e
+                        );
+                    }
+                } else if is_uninit_mark(&timeline_dir) {
+                    let timeline_uninit_mark_file = &timeline_dir;
+                    info!(
+                        "Found an uninit mark file {}, removing the timeline and its uninit mark",
+                        timeline_uninit_mark_file.display()
+                    );
+                    let timeline_id = timeline_uninit_mark_file
+                        .file_stem()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
                            "Could not parse timeline id out of the timeline uninit mark name {}",
                            timeline_uninit_mark_file.display()
                        )
-                    })?;
-                let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-                if let Err(e) =
-                    remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
-                {
-                    error!("Failed to clean up uninit marked timeline: {e:?}");
-                }
-            } else {
-                let timeline_id = timeline_dir
-                    .file_name()
-                    .and_then(OsStr::to_str)
-                    .unwrap_or_default()
-                    .parse::<TimelineId>()
-                    .with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline dir name {}",
-                            timeline_dir.display()
-                        )
-                    })?;
-                let timeline_uninit_mark_file = self
-                    .conf
-                    .timeline_uninit_mark_file_path(self.tenant_id, timeline_id);
-                if timeline_uninit_mark_file.exists() {
-                    info!(
-                        "Found an uninit mark file for timeline {}/{}, removing the timeline and its uninit mark",
-                        self.tenant_id, timeline_id
-                    );
+                        })?;
+                    let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id);
                    if let Err(e) =
-                        remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file)
+                        remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
                    {
                        error!("Failed to clean up uninit marked timeline: {e:?}");
                    }
-                    continue;
-                }
-
-                let file_name = entry.file_name();
-                if let Ok(timeline_id) =
-                    file_name.to_str().unwrap_or_default().parse::<TimelineId>()
-                {
-                    let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
-                        .context("failed to load metadata")?;
-                    timelines_to_load.insert(timeline_id, metadata);
                } else {
-                    // A file or directory that doesn't look like a timeline ID
-                    warn!(
-                        "unexpected file or directory in timelines directory: {}",
-                        file_name.to_string_lossy()
-                    );
+                    let timeline_id = timeline_dir
+                        .file_name()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
+                                "Could not parse timeline id out of the timeline dir name {}",
+                                timeline_dir.display()
+                            )
+                        })?;
+                    let timeline_uninit_mark_file =
+                        conf.timeline_uninit_mark_file_path(tenant_id, timeline_id);
+                    if timeline_uninit_mark_file.exists() {
+                        info!(
+                            %timeline_id,
+                            "Found an uninit mark file, removing the timeline and its uninit mark",
+                        );
+                        if let Err(e) = remove_timeline_and_uninit_mark(
+                            &timeline_dir,
+                            &timeline_uninit_mark_file,
+                        ) {
+                            error!("Failed to clean up uninit marked timeline: {e:?}");
+                        }
+                        continue;
+                    }
+
+                    let file_name = entry.file_name();
+                    if let Ok(timeline_id) =
+                        file_name.to_str().unwrap_or_default().parse::<TimelineId>()
+                    {
+                        let metadata = load_metadata(conf, timeline_id, tenant_id)
+                            .context("failed to load metadata")?;
+                        timelines_to_load.insert(timeline_id, metadata);
+                    } else {
+                        // A file or directory that doesn't look like a timeline ID
+                        warn!(
+                            "unexpected file or directory in timelines directory: {}",
+                            file_name.to_string_lossy()
+                        );
+                    }
                }
            }
-        }

-        // Sort the array of timeline IDs into tree-order, so that parent comes before
-        // all its children.
-        let sorted_timelines = tree_sort_timelines(timelines_to_load)?;
+            // Sort the array of timeline IDs into tree-order, so that parent comes before
+            // all its children.
+            tree_sort_timelines(timelines_to_load)
+        })
+        .await
+        .context("load spawn_blocking")
+        .and_then(|res| res)?;
+
        // FIXME original collect_timeline_files contained one more check:
        //    1. "Timeline has no ancestor and no layer files"

@@ -1037,7 +1094,7 @@ impl Tenant {
                .with_context(|| format!("load local timeline {timeline_id}"))?;
        }

-        info!("Done");
+        trace!("Done");

        Ok(())
    }
@@ -1234,6 +1291,7 @@ impl Tenant {
        ancestor_timeline_id: Option<TimelineId>,
        mut ancestor_start_lsn: Option<Lsn>,
        pg_version: u32,
+        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> anyhow::Result<Option<Arc<Timeline>>> {
        anyhow::ensure!(
@@ -1300,7 +1358,7 @@ impl Tenant {
            }
        };

-        loaded_timeline.activate(ctx).context("activate timeline")?;
+        loaded_timeline.activate(broker_client, ctx);

        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
@@ -1403,7 +1461,11 @@ impl Tenant {
        Ok(())
    }

-    /// Removes timeline-related in-memory data
+    /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
+    /// data from disk.
+    ///
+    /// This doesn't currently delete all data from S3, but sets a flag in its
+    /// index_part.json file to mark it as deleted.
    pub async fn delete_timeline(
        &self,
        timeline_id: TimelineId,
@@ -1413,7 +1475,11 @@ impl Tenant {

        // Transition the timeline into TimelineState::Stopping.
        // This should prevent new operations from starting.
-        let timeline = {
+        //
+        // Also grab the Timeline's delete_lock to prevent another deletion from starting.
+        let timeline;
+        let mut delete_lock_guard;
+        {
            let mut timelines = self.timelines.lock().unwrap();

            // Ensure that there are no child timelines **attached to that pageserver**,
@@ -1431,24 +1497,43 @@ impl Tenant {
                Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
            };

-            let timeline = Arc::clone(timeline_entry.get());
+            timeline = Arc::clone(timeline_entry.get());
+
+            // Prevent two tasks from trying to delete the timeline at the same time.
+            //
+            // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller
+            // needs to poll until the operation has finished. But for now, we return an
+            // error, because the control plane knows to retry errors.
+            delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| {
+                DeleteTimelineError::Other(anyhow::anyhow!(
+                    "timeline deletion is already in progress"
+                ))
+            })?;
+
+            // If another task finished the deletion just before we acquired the lock,
+            // return success.
+            if *delete_lock_guard {
+                return Ok(());
+            }
+
            timeline.set_state(TimelineState::Stopping);

            drop(timelines);
-            timeline
-        };
+        }

        // Now that the Timeline is in Stopping state, request all the related tasks to
        // shut down.
        //
-        // NB: If you call delete_timeline multiple times concurrently, they will
-        // all go through the motions here. Make sure the code here is idempotent,
-        // and don't error out if some of the shutdown tasks have already been
-        // completed!
+        // NB: If this fails half-way through, and is retried, the retry will go through
+        // all the same steps again. Make sure the code here is idempotent, and don't
+        // error out if some of the shutdown tasks have already been completed!

        // Stop the walreceiver first.
        debug!("waiting for wal receiver to shutdown");
-        timeline.walreceiver.stop().await;
+        let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+        if let Some(walreceiver) = maybe_started_walreceiver {
+            walreceiver.stop().await;
+        }
        debug!("wal receiver shutdown confirmed");

        // Prevent new uploads from starting.
@@ -1482,6 +1567,10 @@ impl Tenant {
                // If we (now, or already) marked it successfully as deleted, we can proceed
                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
                // Bail out otherwise
+                //
+                // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+                // two tasks from performing the deletion at the same time. The first task
+                // that starts deletion should run it to completion.
                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
@@ -1492,14 +1581,12 @@ impl Tenant {
        {
            // Grab the layer_removal_cs lock, and actually perform the deletion.
            //
-            // This lock prevents multiple concurrent delete_timeline calls from
-            // stepping on each other's toes, while deleting the files. It also
-            // prevents GC or compaction from running at the same time.
+            // This lock prevents GC or compaction from running at the same time.
+            // The GC task doesn't register itself with the timeline it's operating on,
+            // so it might still be running even though we called `shutdown_tasks`.
            //
            // Note that there are still other race conditions between
-            // GC, compaction and timeline deletion. GC task doesn't
-            // register itself properly with the timeline it's
-            // operating on. See
+            // GC, compaction and timeline deletion. See
            // https://github.com/neondatabase/neon/issues/2671
            //
            // No timeout here, GC & Compaction should be responsive to the
@@ -1561,37 +1648,27 @@ impl Tenant {
        });

        // Remove the timeline from the map.
-        let mut timelines = self.timelines.lock().unwrap();
-        let children_exist = timelines
-            .iter()
-            .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-        // We already deleted the layer files, so it's probably best to panic.
-        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-        if children_exist {
-            panic!("Timeline grew children while we removed layer files");
+        {
+            let mut timelines = self.timelines.lock().unwrap();
+
+            let children_exist = timelines
+                .iter()
+                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+            // We already deleted the layer files, so it's probably best to panic.
+            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+            if children_exist {
+                panic!("Timeline grew children while we removed layer files");
+            }
+
+            timelines.remove(&timeline_id).expect(
+                "timeline that we were deleting was concurrently removed from 'timelines' map",
+            );
        }
-        let removed_timeline = timelines.remove(&timeline_id);
-        if removed_timeline.is_none() {
-            // This can legitimately happen if there's a concurrent call to this function.
-            //   T1                                             T2
-            //   lock
-            //   unlock
-            //                                                  lock
-            //                                                  unlock
-            //                                                  remove files
-            //                                                  lock
-            //                                                  remove from map
-            //                                                  unlock
-            //                                                  return
-            //   remove files
-            //   lock
-            //   remove from map observes empty map
-            //   unlock
-            //   return
-            debug!("concurrent call to this function won the race");
-        }
-        drop(timelines);
+
+        // All done! Mark the deletion as completed and release the delete_lock
+        *delete_lock_guard = true;
+        drop(delete_lock_guard);

        Ok(())
    }
@@ -1605,147 +1682,196 @@ impl Tenant {
    }

    /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+    fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        init_done: Option<&completion::Barrier>,
+        ctx: &RequestContext,
+    ) {
        debug_assert_current_span_has_tenant_id();

-        let mut result = Ok(());
+        let mut activating = false;
        self.state.send_modify(|current_state| {
+            use pageserver_api::models::ActivatingFrom;
            match &*current_state {
-                TenantState::Active => {
-                    // activate() was called on an already Active tenant. Shouldn't happen.
-                    result = Err(anyhow::anyhow!("Tenant is already active"));
+                TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => {
+                    panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
                }
-                TenantState::Broken { reason, .. } => {
-                    // This shouldn't happen either
-                    result = Err(anyhow::anyhow!(
-                        "Could not activate tenant because it is in broken state due to: {reason}",
-                    ));
+                TenantState::Loading => {
+                    *current_state = TenantState::Activating(ActivatingFrom::Loading);
                }
-                TenantState::Stopping => {
-                    // The tenant was detached, or system shutdown was requested, while we were
-                    // loading or attaching the tenant.
-                    info!("Tenant is already in Stopping state, skipping activation");
-                }
-                TenantState::Loading | TenantState::Attaching => {
-                    *current_state = TenantState::Active;
-
-                    debug!(tenant_id = %self.tenant_id, "Activating tenant");
-
-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-
-                    // Spawn gc and compaction loops. The loops will shut themselves
-                    // down when they notice that the tenant is inactive.
-                    tasks::start_background_loops(self.tenant_id);
-
-                    let mut activated_timelines = 0;
-                    let mut timelines_broken_during_activation = 0;
-
-                    for timeline in not_broken_timelines {
-                        match timeline
-                            .activate(ctx)
-                            .context("timeline activation for activating tenant")
-                        {
-                            Ok(()) => {
-                                activated_timelines += 1;
-                            }
-                            Err(e) => {
-                                error!(
-                                    "Failed to activate timeline {}: {:#}",
-                                    timeline.timeline_id, e
-                                );
-                                timeline.set_state(TimelineState::Broken);
-                                *current_state = TenantState::broken_from_reason(format!(
-                                    "failed to activate timeline {}: {}",
-                                    timeline.timeline_id, e
-                                ));
-
-                                timelines_broken_during_activation += 1;
-                            }
-                        }
-                    }
-
-                    let elapsed = self.loading_started_at.elapsed();
-                    let total_timelines = timelines_accessor.len();
-
-                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
-                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
-                    info!(
-                        since_creation_millis = elapsed.as_millis(),
-                        tenant_id = %self.tenant_id,
-                        activated_timelines,
-                        timelines_broken_during_activation,
-                        total_timelines,
-                        post_state = <&'static str>::from(&*current_state),
-                        "activation attempt finished"
-                    );
+                TenantState::Attaching => {
+                    *current_state = TenantState::Activating(ActivatingFrom::Attaching);
                }
            }
+            debug!(tenant_id = %self.tenant_id, "Activating tenant");
+            activating = true;
+            // Continue outside the closure. We need to grab timelines.lock()
+            // and we plan to turn it into a tokio::sync::Mutex in a future patch.
        });
-        result
+
+        if activating {
+            let timelines_accessor = self.timelines.lock().unwrap();
+            let not_broken_timelines = timelines_accessor
+                .values()
+                .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+
+            // Spawn gc and compaction loops. The loops will shut themselves
+            // down when they notice that the tenant is inactive.
+            tasks::start_background_loops(self, init_done);
+
+            let mut activated_timelines = 0;
+
+            for timeline in not_broken_timelines {
+                timeline.activate(broker_client.clone(), ctx);
+                activated_timelines += 1;
+            }
+
+            self.state.send_modify(move |current_state| {
+                assert!(
+                    matches!(current_state, TenantState::Activating(_)),
+                    "set_stopping and set_broken wait for us to leave Activating state",
+                );
+                *current_state = TenantState::Active;
+
+                let elapsed = self.loading_started_at.elapsed();
+                let total_timelines = timelines_accessor.len();
+
+                // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                info!(
+                    since_creation_millis = elapsed.as_millis(),
+                    tenant_id = %self.tenant_id,
+                    activated_timelines,
+                    total_timelines,
+                    post_state = <&'static str>::from(&*current_state),
+                    "activation attempt finished"
+                );
+            });
+        }
    }

-    /// Change tenant status to Stopping, to mark that it is being shut down
-    pub fn set_stopping(&self) {
-        self.state.send_modify(|current_state| {
-            match current_state {
-                TenantState::Active | TenantState::Loading | TenantState::Attaching => {
-                    *current_state = TenantState::Stopping;
+    /// Change tenant status to Stopping, to mark that it is being shut down.
+    ///
+    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
+    ///
+    /// This function is not cancel-safe!
+    pub async fn set_stopping(&self) -> Result<(), SetStoppingError> {
+        let mut rx = self.state.subscribe();

-                    // FIXME: If the tenant is still Loading or Attaching, new timelines
-                    // might be created after this. That's harmless, as the Timelines
-                    // won't be accessible to anyone, when the Tenant is in Stopping
-                    // state.
-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-                    for timeline in not_broken_timelines {
-                        timeline.set_state(TimelineState::Stopping);
-                    }
-                }
-                TenantState::Broken { reason, .. } => {
-                    info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}");
-                }
-                TenantState::Stopping => {
-                    // The tenant was detached, or system shutdown was requested, while we were
-                    // loading or attaching the tenant.
-                    info!("Tenant is already in Stopping state");
-                }
+        // cannot stop before we're done activating, so wait out until we're done activating
+        rx.wait_for(|state| match state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                info!(
+                    "waiting for {} to turn Active|Broken|Stopping",
+                    <&'static str>::from(state)
+                );
+                false
+            }
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
+        })
+        .await
+        .expect("cannot drop self.state while on a &self method");
+
+        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
+        let mut err = None;
+        let stopping = self.state.send_if_modified(|current_state| match current_state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+            }
+            TenantState::Active => {
+                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
+                // are created after the transition to Stopping. That's harmless, as the Timelines
+                // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
+                *current_state = TenantState::Stopping;
+                // Continue stopping outside the closure. We need to grab timelines.lock()
+                // and we plan to turn it into a tokio::sync::Mutex in a future patch.
+                true
+            }
+            TenantState::Broken { reason, .. } => {
+                info!(
+                    "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
+                );
+                err = Some(SetStoppingError::Broken);
+                false
+            }
+            TenantState::Stopping => {
+                info!("Tenant is already in Stopping state");
+                err = Some(SetStoppingError::AlreadyStopping);
+                false
            }
        });
+        match (stopping, err) {
+            (true, None) => {} // continue
+            (false, Some(err)) => return Err(err),
+            (true, Some(_)) => unreachable!(
+                "send_if_modified closure must error out if not transitioning to Stopping"
+            ),
+            (false, None) => unreachable!(
+                "send_if_modified closure must return true if transitioning to Stopping"
+            ),
+        }
+
+        let timelines_accessor = self.timelines.lock().unwrap();
+        let not_broken_timelines = timelines_accessor
+            .values()
+            .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+        for timeline in not_broken_timelines {
+            timeline.set_state(TimelineState::Stopping);
+        }
+        Ok(())
    }

-    pub fn set_broken(&self, reason: String) {
+    /// Method for tenant::mgr to transition us into Broken state in case of a late failure in
+    /// `remove_tenant_from_memory`
+    ///
+    /// This function waits for any in-progress load/attach/activation to finish (state becomes Active, Broken, or Stopping) before transitioning the tenant into Broken state.
+    ///
+    /// In tests, we also use this to set tenants to Broken state on purpose.
+    pub(crate) async fn set_broken(&self, reason: String) {
+        let mut rx = self.state.subscribe();
+
+        // The load & attach routines own the tenant state until it has reached `Active`.
+        // So, wait until it's done.
+        rx.wait_for(|state| match state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                info!(
+                    "waiting for {} to turn Active|Broken|Stopping",
+                    <&'static str>::from(state)
+                );
+                false
+            }
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
+        })
+        .await
+        .expect("cannot drop self.state while on a &self method");
+
+        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
        self.state.send_modify(|current_state| {
            match *current_state {
+                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+                }
                TenantState::Active => {
-                    // Broken tenants can currently only used for fatal errors that happen
-                    // while loading or attaching a tenant. A tenant that has already been
-                    // activated should never be marked as broken. We cope with it the best
-                    // we can, but it shouldn't happen.
-                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
-                    *current_state = TenantState::broken_from_reason(reason);
+                    if cfg!(feature = "testing") {
+                        warn!("Changing Active tenant to Broken state, reason: {}", reason);
+                        *current_state = TenantState::broken_from_reason(reason);
+                    } else {
+                        unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
+                    }
                }
                TenantState::Broken { .. } => {
-                    // This shouldn't happen either
                    warn!("Tenant is already in Broken state");
                }
+                // This is the only "expected" path, any other path is a bug.
                TenantState::Stopping => {
-                    // This shouldn't happen either
                    warn!(
                        "Marking Stopping tenant as Broken state, reason: {}",
                        reason
                    );
                    *current_state = TenantState::broken_from_reason(reason);
                }
-                TenantState::Loading | TenantState::Attaching => {
-                    info!("Setting tenant as Broken state, reason: {}", reason);
-                    *current_state = TenantState::broken_from_reason(reason);
-                }
-            }
+           }
        });
    }

@@ -1753,25 +1879,30 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub async fn wait_to_become_active(&self) -> anyhow::Result<()> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
            match current_state {
-                TenantState::Loading | TenantState::Attaching => {
+                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
-                    receiver.changed().await?;
+                    receiver.changed().await.map_err(
+                        |_e: tokio::sync::watch::error::RecvError| {
+                            WaitToBecomeActiveError::TenantDropped {
+                                tenant_id: self.tenant_id,
+                            }
+                        },
+                    )?;
                }
                TenantState::Active { .. } => {
                    return Ok(());
                }
                TenantState::Broken { .. } | TenantState::Stopping => {
                    // There's no chance the tenant can transition back into ::Active
-                    anyhow::bail!(
-                        "Tenant {} will not become active. Current state: {:?}",
-                        self.tenant_id,
-                        &current_state,
-                    );
+                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
+                        tenant_id: self.tenant_id,
+                        state: current_state,
+                    });
                }
            }
        }
@@ -3078,6 +3209,7 @@ pub mod harness {
                evictions_low_residence_duration_metric_threshold: Some(
                    tenant_conf.evictions_low_residence_duration_metric_threshold,
                ),
+                gc_feedback: Some(tenant_conf.gc_feedback),
            }
        }
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -99,6 +99,7 @@ pub struct TenantConf {
    // See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
+    pub gc_feedback: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -175,6 +176,10 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub gc_feedback: Option<bool>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -242,6 +247,7 @@ impl TenantConfOpt {
            evictions_low_residence_duration_metric_threshold: self
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
+            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
        }
    }
 }
@@ -278,6 +284,7 @@ impl Default for TenantConf {
                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            gc_feedback: false,
        }
    }
 }
@@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
                    ))?,
            );
        }
+        tenant_conf.gc_feedback = request_data.gc_feedback;

        Ok(tenant_conf)
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -56,7 +56,6 @@ use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;
-use tracing::*;

 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::Replacement;
@@ -276,14 +275,11 @@ where
    ///
    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
-        let key = historic_layer_coverage::LayerKey::from(&*layer);
-        if self.historic.contains(&key) {
-            error!(
-                "Attempt to insert duplicate layer {} in layer map",
-                layer.short_id()
-            );
-        }
-        self.historic.insert(key, Arc::clone(&layer));
+        self.historic.insert(
+            historic_layer_coverage::LayerKey::from(&*layer),
+            Arc::clone(&layer),
+        );
+
        if Self::is_l0(&layer) {
            self.l0_delta_layers.push(layer);
        }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -204,6 +204,35 @@ fn test_off_by_one() {
    assert_eq!(version.image_coverage.query(5), None);
 }

+/// White-box regression test, checking for incorrect removal of node at key.end
+#[test]
+fn test_regression() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 0..5,
+            is_image: false,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 1..2,
+            is_image: false,
+        },
+        "Layer 2".to_string(),
+    );
+
+    // If an insertion operation improperly deletes the endpoint of a previous layer
+    // (which is more likely to happen with layers that collide on key.end), we will
+    // end up with an infinite layer, covering the entire keyspace. Here we assert
+    // that there's no layer at key 100 because we didn't insert any layer there.
+    let version = map.get_version(100).unwrap();
+    assert_eq!(version.delta_coverage.query(100), None);
+}
+
 /// Cover edge cases where layers begin or end on the same key
 #[test]
 fn test_key_collision() {
@@ -417,15 +446,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        }
    }

-    pub fn contains(&self, layer_key: &LayerKey) -> bool {
-        match self.buffer.get(layer_key) {
-            Some(None) => false,                         // layer remove was buffered
-            Some(_) => true,                             // layer insert was buffered
-            None => self.layers.contains_key(layer_key), // no buffered ops for this layer
-        }
-    }
-
-	pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
+    pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
        self.buffer.insert(layer_key, Some(value));
    }

--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -1,8 +1,8 @@
 use std::ops::Range;

-// TODO the `im` crate has 20x more downloads and also has
-// persistent/immutable BTree. It also runs a bit faster but
-// results are not the same on some tests.
+// NOTE the `im` crate has 20x more downloads and also has
+// persistent/immutable BTree. But it's bugged so rpds is a
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync;
 /// - iterate the latest layers in a key range
 /// - insert layers in non-decreasing lsn.start order
 ///
-/// The struct is parameterized over Value for easier
-/// testing, but in practice it's some sort of layer.
+/// For a detailed explanation and justification of this approach, see:
+/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
+///
+/// NOTE The struct is parameterized over Value for easier
+///      testing, but in practice it's some sort of layer.
 pub struct LayerCoverage<Value> {
    /// For every change in coverage (as we sweep the key space)
    /// we store (lsn.end, value).
    ///
-    /// We use an immutable/persistent tree so that we can keep historic
-    /// versions of this coverage without cloning the whole thing and
-    /// incurring quadratic memory cost. See HistoricLayerCoverage.
+    /// NOTE We use an immutable/persistent tree so that we can keep historic
+    ///      versions of this coverage without cloning the whole thing and
+    ///      incurring quadratic memory cost. See HistoricLayerCoverage.
    ///
-    /// We use the Sync version of the map because we want Self to
-    /// be Sync. Using nonsync might be faster, if we can work with
-    /// that.
+    /// NOTE We use the Sync version of the map because we want Self to
+    ///      be Sync. Using nonsync might be faster, if we can work with
+    ///      that.
    nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
 }

@@ -41,6 +44,13 @@ impl<Value: Clone> LayerCoverage<Value> {

    /// Helper function to subdivide the key range without changing any values
    ///
+    /// This operation has no semantic effect by itself. It only helps us pin in
+    /// place the part of the coverage we don't want to change when inserting.
+    ///
+    /// As an analogy, think of a polygon. If you add a vertex along one of the
+    /// segments, the polygon is still the same, but it behaves differently when
+    /// we move or delete one of the other points.
+    ///
    /// Complexity: O(log N)
    fn add_node(&mut self, key: i128) {
        let value = match self.nodes.range(..=key).last() {
@@ -74,7 +84,7 @@ impl<Value: Clone> LayerCoverage<Value> {
        let mut to_update = Vec::new();
        let mut to_remove = Vec::new();
        let mut prev_covered = false;
-        for (k, node) in self.nodes.range(key.clone()) {
+        for (k, node) in self.nodes.range(key) {
            let needs_cover = match node {
                None => true,
                Some((h, _)) => h < &lsn.end,
@@ -87,9 +97,8 @@ impl<Value: Clone> LayerCoverage<Value> {
            }
            prev_covered = needs_cover;
        }
-        if !prev_covered {
-            to_remove.push(key.end);
-        }
+        // TODO check if the nodes inserted at key.start and key.end are safe
+        //      to remove. It's fine to keep them but they could be redundant.
        for k in to_update {
            self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
        }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -10,6 +10,7 @@ use tokio::fs;
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use tokio::sync::RwLock;
+use tokio::task::JoinSet;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
@@ -19,9 +20,12 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState,
+};
 use crate::IGNORED_TENANT_FILE_NAME;

+use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

@@ -58,10 +62,12 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
 /// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
-#[instrument(skip(conf, remote_storage))]
+#[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_done: (completion::Completion, completion::Barrier),
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -116,7 +122,9 @@ pub async fn init_tenant_mgr(
                    match schedule_local_tenant_processing(
                        conf,
                        &tenant_dir_path,
+                        broker_client.clone(),
                        remote_storage.clone(),
+                        Some(init_done.clone()),
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -150,7 +158,9 @@ pub async fn init_tenant_mgr(
 pub fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_done: Option<(completion::Completion, completion::Barrier)>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -186,7 +196,7 @@ pub fn schedule_local_tenant_processing(
    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
        if let Some(remote_storage) = remote_storage {
-            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+            match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -204,7 +214,14 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            broker_client,
+            remote_storage,
+            init_done,
+            ctx,
+        )
    };
    Ok(tenant)
 }
@@ -219,6 +236,7 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
+#[instrument]
 pub async fn shutdown_all_tenants() {
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
@@ -241,15 +259,65 @@ pub async fn shutdown_all_tenants() {
        }
    };

+    // Set tenant (and its timelines) to Stopping state.
+    //
+    // Since we can only transition into Stopping state after activation is complete,
+    // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
+    //
+    // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
+    // 1. Lock out any new requests to the tenants.
+    // 2. Signal cancellation to WAL receivers (we wait on it below).
+    // 3. Signal cancellation for other tenant background loops.
+    // 4. ???
+    //
+    // The waiting for the cancellation is not done uniformly.
+    // We certainly wait for WAL receivers to shut down.
+    // That is necessary so that no new data comes in before the freeze_and_flush.
+    // But the tenant background loops are joined-on in our caller.
+    // It's messed up.
+    let mut join_set = JoinSet::new();
    let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
-    for (_, tenant) in tenants_to_shut_down {
-        if tenant.is_active() {
-            // updates tenant state, forbidding new GC and compaction iterations from starting
-            tenant.set_stopping();
-            tenants_to_freeze_and_flush.push(tenant);
+    for (tenant_id, tenant) in tenants_to_shut_down {
+        join_set.spawn(
+            async move {
+                match tenant.set_stopping().await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(SetStoppingError::Broken) => {
+                        info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well");
+                    },
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // our task_mgr::shutdown_tasks are going to coalesce on that just fine
+                    }
+                }
+
+                tenant
+            }
+            .instrument(info_span!("set_stopping", %tenant_id)),
+        );
+    }
+
+    let mut panicked = 0;
+
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
+            }
+            Ok(tenant) => tenants_to_freeze_and_flush.push(tenant),
        }
    }

+    if panicked > 0 {
+        warn!(panicked, "observed panicks while stopping tenants");
+    }
+
    // Shut down all existing walreceiver connections and stop accepting the new ones.
    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;

@@ -261,12 +329,30 @@ pub async fn shutdown_all_tenants() {
    // should be no more activity in any of the repositories.
    //
    // On error, log it but continue with the shutdown for other tenants.
+
+    let mut join_set = tokio::task::JoinSet::new();
+
    for tenant in tenants_to_freeze_and_flush {
        let tenant_id = tenant.tenant_id();
-        debug!("shutdown tenant {tenant_id}");

-        if let Err(err) = tenant.freeze_and_flush().await {
-            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
+        join_set.spawn(
+            async move {
+                if let Err(err) = tenant.freeze_and_flush().await {
+                    warn!("Could not checkpoint tenant during shutdown: {err:?}");
+                }
+            }
+            .instrument(info_span!("freeze_and_flush", %tenant_id)),
+        );
+    }
+
+    while let Some(next) = join_set.join_next().await {
+        match next {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("no cancelling")
+            }
+            Err(join_error) if join_error.is_panic() => { /* reported already */ }
+            Err(join_error) => warn!("unknown kind of JoinError: {join_error}"),
        }
    }
 }
@@ -275,6 +361,7 @@ pub async fn create_tenant(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
@@ -287,7 +374,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -300,11 +387,19 @@ pub async fn create_tenant(
    }).await
 }

+#[derive(Debug, thiserror::Error)]
+pub enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+}
+
 pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
+) -> Result<(), SetNewTenantConfigError> {
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

@@ -314,23 +409,32 @@ pub async fn set_new_tenant_config(
        &tenant_config_path,
        new_tenant_conf,
        false,
-    )?;
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    Ok(())
 }

+#[derive(Debug, thiserror::Error)]
+pub enum GetTenantError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+}
+
 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
-) -> Result<Arc<Tenant>, TenantStateError> {
+) -> Result<Arc<Tenant>, GetTenantError> {
    let m = TENANTS.read().await;
    let tenant = m
        .get(&tenant_id)
-        .ok_or(TenantStateError::NotFound(tenant_id))?;
+        .ok_or(GetTenantError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        Err(TenantStateError::NotActive(tenant_id))
+        Err(GetTenantError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
    }
@@ -339,7 +443,7 @@ pub async fn get_tenant(
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
-    Tenant(#[from] TenantStateError),
+    Tenant(#[from] GetTenantError),

    #[error("Timeline {0}")]
    Timeline(#[from] crate::tenant::DeleteTimelineError),
@@ -404,6 +508,7 @@ pub async fn detach_tenant(
 pub async fn load_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -415,7 +520,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -472,6 +577,7 @@ pub async fn attach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_conf: TenantConfOpt,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -487,7 +593,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -566,13 +672,23 @@ where
    {
        let tenants_accessor = TENANTS.write().await;
        match tenants_accessor.get(&tenant_id) {
-            Some(tenant) => match tenant.current_state() {
-                TenantState::Attaching
-                | TenantState::Loading
-                | TenantState::Broken { .. }
-                | TenantState::Active => tenant.set_stopping(),
-                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
-            },
+            Some(tenant) => {
+                let tenant = Arc::clone(tenant);
+                // don't hold TENANTS lock while set_stopping waits for activation to finish
+                drop(tenants_accessor);
+                match tenant.set_stopping().await {
+                    Ok(()) => {
+                        // we won, continue stopping procedure
+                    }
+                    Err(SetStoppingError::Broken) => {
+                        // continue the procedure, let's hope the closure can deal with broken tenants
+                    }
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // someone else is already stopping the tenant (the broken case is handled above), don't do anything
+                        return Err(TenantStateError::IsStopping(tenant_id));
+                    }
+                }
+            }
            None => return Err(TenantStateError::NotFound(tenant_id)),
        }
    }
@@ -597,7 +713,7 @@ where
            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
-                    tenant.set_broken(e.to_string());
+                    tenant.set_broken(e.to_string()).await;
                }
                None => {
                    warn!("Tenant {tenant_id} got removed from memory");
@@ -663,7 +779,6 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-#[cfg(feature = "testing")]
 pub async fn immediate_compact(
    tenant_id: TenantId,
    timeline_id: TimelineId,
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
    Ok(())
 }

-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
-    const PARALLEL_PATH_THRESHOLD: usize = 1;
-    if paths.len() <= PARALLEL_PATH_THRESHOLD {
-        for path in paths {
-            fsync_path(path)?;
-        }
-        return Ok(());
-    }
+fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.

    /// Use at most this number of threads.
    /// Increasing this limit will
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
    let num_threads = paths.len().min(MAX_NUM_THREADS);
    let next_path_idx = AtomicUsize::new(0);

-    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+    std::thread::scope(|s| -> io::Result<()> {
        let mut handles = vec![];
        // Spawn `num_threads - 1`, as the current thread is also a worker.
        for _ in 1..num_threads {
-            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
        }

        parallel_worker(paths, &next_path_idx)?;
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {

        Ok(())
    })
-    .unwrap()
+}
+
+/// Fsync all files in parallel. Can be used in a non-async context, as the work runs on scoped OS threads (`std::thread::scope`).
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    if paths.len() == 1 {
+        fsync_path(&paths[0])?;
+        return Ok(());
+    }
+
+    fsync_in_thread_pool(paths)
+}
+
+/// Parallel fsync asynchronously. Each path is fsynced in its own `spawn_blocking` task on the tokio runtime, with at most
+/// `MAX_CONCURRENT_FSYNC` such tasks in flight at a time. Returns early with the error of the first task observed to fail.
+pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
+    const MAX_CONCURRENT_FSYNC: usize = 64;
+    let mut next = paths.iter().peekable();
+    let mut js = tokio::task::JoinSet::new();
+    loop {
+        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
+            let next = next.next().expect("just peeked");
+            let next = next.to_owned();
+            js.spawn_blocking(move || fsync_path(&next));
+        }
+
+        // now the joinset has been filled up, wait for next to complete
+        if let Some(res) = js.join_next().await {
+            res??;
+        } else {
+            // last item had already completed
+            assert!(
+                next.peek().is_none(),
+                "joinset emptied, we shouldn't have more work"
+            );
+            return Ok(());
+        }
+    }
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,6 +4,7 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
@@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
+pub use layer_desc::PersistentLayerDesc;
 pub use remote_layer::RemoteLayer;

 use super::layer_map::BatchedUpdates;
@@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer {
-    fn get_tenant_id(&self) -> TenantId;
+    /// Get the layer descriptor.
+    fn layer_desc(&self) -> &PersistentLayerDesc;
+
+    fn get_tenant_id(&self) -> TenantId {
+        self.layer_desc().tenant_id
+    }

    /// Identify the timeline this layer belongs to
-    fn get_timeline_id(&self) -> TimelineId;
+    fn get_timeline_id(&self) -> TimelineId {
+        self.layer_desc().timeline_id
+    }

    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName;
+    fn filename(&self) -> LayerFileName {
+        self.layer_desc().filename()
+    }

    // Path to the layer file in the local filesystem.
    // `None` for `RemoteLayer`.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -56,8 +56,8 @@ use utils::{
 };

 use super::{
-    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
-    LayerKeyIter, PathOrConf,
+    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    PathOrConf, PersistentLayerDesc,
 };

 ///
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
-            lsn_range: layer.lsn_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn_range: layer.desc.lsn_range.clone(),

            index_start_blk: 0,
            index_root_blk: 0,
@@ -180,10 +180,7 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path_or_conf: PathOrConf,

-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
+    pub desc: PersistentLayerDesc,

    pub file_size: u64,

@@ -197,8 +194,8 @@ impl std::fmt::Debug for DeltaLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("DeltaLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
-            .field("lsn_range", &self.lsn_range)
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
+            .field("lsn_range", &self.desc.lsn_range)
            .field("file_size", &self.file_size)
            .field("inner", &self.inner)
            .finish()
@@ -228,30 +225,16 @@ impl std::fmt::Debug for DeltaLayerInner {
 }

 impl Layer for DeltaLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-    fn is_incremental(&self) -> bool {
-        true
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        if !verbose {
@@ -324,10 +307,10 @@ impl Layer for DeltaLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.lsn_range.start);
+        ensure!(lsn_range.start >= self.desc.lsn_range.start);
        let mut need_image = true;

-        ensure!(self.key_range.contains(&key));
+        ensure!(self.desc.key_range.contains(&key));

        {
            // Open the file and lock the metadata in memory
@@ -402,19 +385,31 @@ impl Layer for DeltaLayer {
            Ok(ValueReconstructResult::Complete)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for DeltaLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
-
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -602,10 +597,12 @@ impl DeltaLayer {
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
-            lsn_range: filename.lsn_range.clone(),
+            desc: PersistentLayerDesc::new_delta(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn_range.clone(),
+            ),
            file_size,
            access_stats,
            inner: RwLock::new(DeltaLayerInner {
@@ -632,10 +629,12 @@ impl DeltaLayer {

        Ok(DeltaLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
-            lsn_range: summary.lsn_range,
+            desc: PersistentLayerDesc::new_delta(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn_range,
+            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
@@ -648,18 +647,14 @@ impl DeltaLayer {
    }

    fn layer_name(&self) -> DeltaFileName {
-        DeltaFileName {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-        }
+        self.desc.delta_file_name()
    }
-
    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -803,10 +798,12 @@ impl DeltaLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.
        let layer = DeltaLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            tenant_id: self.tenant_id,
-            timeline_id: self.timeline_id,
-            key_range: self.key_start..key_end,
-            lsn_range: self.lsn_range.clone(),
+            desc: PersistentLayerDesc::new_delta(
+                self.tenant_id,
+                self.timeline_id,
+                self.key_start..key_end,
+                self.lsn_range.clone(),
+            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -9,6 +9,8 @@ use std::str::FromStr;

 use utils::lsn::Lsn;

+use super::PersistentLayerDesc;
+
 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
 pub struct DeltaFileName {
@@ -153,7 +155,7 @@ impl Ord for ImageFileName {
 impl ImageFileName {
    pub fn lsn_as_range(&self) -> Range<Lsn> {
        // Saves from having to copypaste this all over
-        self.lsn..(self.lsn + 1)
+        PersistentLayerDesc::image_layer_lsn_range(self.lsn)
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -52,8 +52,8 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{ImageFileName, LayerFileName};
-use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
+use super::filename::ImageFileName;
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
            lsn: layer.lsn,

            index_start_blk: 0,
@@ -104,14 +104,13 @@ impl From<&ImageLayer> for Summary {
 /// and it needs to be loaded before using it in queries.
 pub struct ImageLayer {
    path_or_conf: PathOrConf,
-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub file_size: u64,

-    // This entry contains an image of all pages as of this LSN
+    pub desc: PersistentLayerDesc,
+    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn_range.start
    pub lsn: Lsn,

+    pub file_size: u64,
+
    access_stats: LayerAccessStats,

    inner: RwLock<ImageLayerInner>,
@@ -122,7 +121,7 @@ impl std::fmt::Debug for ImageLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("ImageLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
            .field("file_size", &self.file_size)
            .field("lsn", &self.lsn)
            .field("inner", &self.inner)
@@ -153,27 +152,15 @@ impl std::fmt::Debug for ImageLayerInner {
 }

 impl Layer for ImageLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        // End-bound is exclusive
-        self.lsn..(self.lsn + 1)
-    }
-    fn is_incremental(&self) -> bool {
-        false
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} ----",
-            self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.lsn
        );

        if !verbose {
@@ -203,7 +190,7 @@ impl Layer for ImageLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.key_range.contains(&key));
+        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

@@ -230,24 +217,37 @@ impl Layer for ImageLayer {
            Ok(ValueReconstructResult::Missing)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for ImageLayer {
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }

-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
        unimplemented!();
    }
@@ -405,9 +405,13 @@ impl ImageLayer {
    ) -> ImageLayer {
        ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
+            desc: PersistentLayerDesc::new_img(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn,
+                false,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            file_size,
            access_stats,
@@ -433,9 +437,13 @@ impl ImageLayer {
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
+            desc: PersistentLayerDesc::new_img(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn,
+                false,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -449,18 +457,15 @@ impl ImageLayer {
    }

    fn layer_name(&self) -> ImageFileName {
-        ImageFileName {
-            key_range: self.key_range.clone(),
-            lsn: self.lsn,
-        }
+        self.desc.image_file_name()
    }

    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -484,6 +489,7 @@ struct ImageLayerWriterInner {
    tenant_id: TenantId,
    key_range: Range<Key>,
    lsn: Lsn,
+    is_incremental: bool,

    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -499,6 +505,7 @@ impl ImageLayerWriterInner {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
@@ -533,6 +540,7 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
+            is_incremental,
        };

        Ok(writer)
@@ -570,6 +578,14 @@ impl ImageLayerWriterInner {
            file.write_all(buf.as_ref())?;
        }

+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            self.is_incremental, // for now, image layer ALWAYS covers the full range
+        );
+
        // Fill in the summary on blk 0
        let summary = Summary {
            magic: IMAGE_FILE_MAGIC,
@@ -593,9 +609,7 @@ impl ImageLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.
        let layer = ImageLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            timeline_id: self.timeline_id,
-            tenant_id: self.tenant_id,
-            key_range: self.key_range.clone(),
+            desc,
            lsn: self.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -667,6 +681,7 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -675,6 +690,7 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
+                is_incremental,
            )?),
        })
    }
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -0,0 +1,109 @@
+use std::ops::Range;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::repository::Key;
+
+use super::{DeltaFileName, ImageFileName, LayerFileName};
+
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// a unified way to generate layer information like file name.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PersistentLayerDesc {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    /// For image layer, this is `[lsn, lsn+1)`.
+    pub lsn_range: Range<Lsn>,
+    /// Whether this is a delta layer.
+    pub is_delta: bool,
+    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
+    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
+    /// incremental.
+    pub is_incremental: bool,
+}
+
+impl PersistentLayerDesc {
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+
+    pub fn new_img(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+        is_incremental: bool,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range: Self::image_layer_lsn_range(lsn),
+            is_delta: false,
+            is_incremental,
+        }
+    }
+
+    pub fn new_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range,
+            is_delta: true,
+            is_incremental: true,
+        }
+    }
+
+    /// Get the LSN that the image layer covers.
+    pub fn image_layer_lsn(&self) -> Lsn {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        self.lsn_range.start
+    }
+
+    /// Get the LSN range corresponding to a single image layer LSN.
+    pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
+        lsn..(lsn + 1)
+    }
+
+    /// Get a delta file name for this layer.
+    ///
+    /// Panic: if this is not a delta layer.
+    pub fn delta_file_name(&self) -> DeltaFileName {
+        assert!(self.is_delta);
+        DeltaFileName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        }
+    }
+
+    /// Get an image file name for this layer.
+    ///
+    /// Panic: if this is not an image layer, or the lsn range is invalid
+    pub fn image_file_name(&self) -> ImageFileName {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        ImageFileName {
+            key_range: self.key_range.clone(),
+            lsn: self.lsn_range.start,
+        }
+    }
+
+    pub fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            self.delta_file_name().into()
+        } else {
+            self.image_file_name().into()
+        }
+    }
+}
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -18,11 +18,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
-use super::image_layer::ImageLayer;
+use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
-    LayerResidenceStatus, PersistentLayer,
+    DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -34,19 +33,10 @@ use super::{
 ///
 /// See: [`crate::context::RequestContext`] for authorization to download
 pub struct RemoteLayer {
-    tenantid: TenantId,
-    timelineid: TimelineId,
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-
-    pub file_name: LayerFileName,
+    pub desc: PersistentLayerDesc,

    pub layer_metadata: LayerFileMetadata,

-    is_delta: bool,
-
-    is_incremental: bool,
-
    access_stats: LayerAccessStats,

    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -66,22 +56,14 @@ pub struct RemoteLayer {
 impl std::fmt::Debug for RemoteLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.file_name)
+            .field("file_name", &self.desc.filename())
            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.is_incremental)
+            .field("is_incremental", &self.desc.is_incremental)
            .finish()
    }
 }

 impl Layer for RemoteLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-
    fn get_value_reconstruct_data(
        &self,
        _key: Key,
@@ -95,53 +77,45 @@ impl Layer for RemoteLayer {
        );
    }

-    fn is_incremental(&self) -> bool {
-        self.is_incremental
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenantid,
-            self.timelineid,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        Ok(())
    }

+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
    fn short_id(&self) -> String {
-        self.filename().file_name()
+        self.layer_desc().short_id()
    }
 }

 impl PersistentLayer for RemoteLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenantid
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timelineid
-    }
-
-    fn filename(&self) -> LayerFileName {
-        if self.is_delta {
-            DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            }
-            .into()
-        } else {
-            ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            }
-            .into()
-        }
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -176,7 +150,7 @@ impl PersistentLayer for RemoteLayer {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();

-        if self.is_delta {
+        if self.desc.is_delta {
            HistoricLayerInfo::Delta {
                layer_file_name,
                layer_file_size: self.layer_metadata.file_size(),
@@ -210,13 +184,13 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_as_range(),
-            is_delta: false,
-            is_incremental: false,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_img(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn,
+                false,
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -232,13 +206,12 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_range.clone(),
-            is_delta: true,
-            is_incremental: true,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_delta(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn_range.clone(),
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -256,15 +229,12 @@ impl RemoteLayer {
    where
        L: ?Sized + Layer,
    {
-        if self.is_delta {
-            let fname = DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            };
+        if self.desc.is_delta {
+            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
@@ -273,14 +243,11 @@ impl RemoteLayer {
                ),
            ))
        } else {
-            let fname = ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            };
+            let fname = self.desc.image_file_name();
            Arc::new(ImageLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,13 +9,13 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::TenantId;
+use utils::completion;

-pub fn start_background_loops(tenant_id: TenantId) {
+pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
+    let tenant_id = tenant.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
@@ -23,11 +23,16 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("compactor for tenant {tenant_id}"),
        false,
-        async move {
-            compaction_loop(tenant_id)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let init_done = init_done.cloned();
+            async move {
+                completion::Barrier::maybe_wait(init_done).await;
+                compaction_loop(tenant)
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
    task_mgr::spawn(
@@ -37,11 +42,16 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("garbage collector for tenant {tenant_id}"),
        false,
-        async move {
-            gc_loop(tenant_id)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let init_done = init_done.cloned();
+            async move {
+                completion::Barrier::maybe_wait(init_done).await;
+                gc_loop(tenant)
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
 }
@@ -49,7 +59,7 @@ pub fn start_background_loops(tenant_id: TenantId) {
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant_id: TenantId) {
+async fn compaction_loop(tenant: Arc<Tenant>) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
@@ -60,16 +70,16 @@ async fn compaction_loop(tenant_id: TenantId) {
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_compaction_period();

@@ -119,7 +129,7 @@ async fn compaction_loop(tenant_id: TenantId) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant_id: TenantId) {
+async fn gc_loop(tenant: Arc<Tenant>) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
@@ -127,21 +137,22 @@ async fn gc_loop(tenant_id: TenantId) {
        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
-        let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let ctx =
+            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_gc_period();

@@ -161,7 +172,9 @@ async fn gc_loop(tenant_id: TenantId) {
                Duration::from_secs(10)
            } else {
                // Run gc
-                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                let res = tenant
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .await;
                if let Err(e) = res {
                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
@@ -187,23 +200,10 @@ async fn gc_loop(tenant_id: TenantId) {
    trace!("GC loop stopped.");
 }

-async fn wait_for_active_tenant(
-    tenant_id: TenantId,
-    wait: Duration,
-) -> ControlFlow<(), Arc<Tenant>> {
-    let tenant = loop {
-        match mgr::get_tenant(tenant_id, false).await {
-            Ok(tenant) => break tenant,
-            Err(e) => {
-                error!("Failed to get a tenant {tenant_id}: {e:#}");
-                tokio::time::sleep(wait).await;
-            }
-        }
-    };
-
+async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
    // if the tenant has a proper status already, no need to wait for anything
    if tenant.current_state() == TenantState::Active {
-        ControlFlow::Continue(tenant)
+        ControlFlow::Continue(())
    } else {
        let mut tenant_state_updates = tenant.subscribe_for_state_updates();
        loop {
@@ -213,7 +213,7 @@ async fn wait_for_active_tenant(
                    match new_state {
                        TenantState::Active => {
                            debug!("Tenant state changed to active, continuing the task loop");
-                            return ControlFlow::Continue(tenant);
+                            return ControlFlow::Continue(());
                        }
                        state => {
                            debug!("Not running the task loop, tenant is not active: {state:?}");
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -31,7 +31,6 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::broker_client::{get_broker_client, is_broker_client_initialized};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::{
@@ -196,8 +195,9 @@ pub struct Timeline {
    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
-    /// and [`Tenant::delete_timeline`].
-    pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
+    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
+    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
+    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,

    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -227,7 +227,7 @@ pub struct Timeline {
    /// or None if WAL receiver has not received anything for this timeline
    /// yet.
    pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
-    pub walreceiver: WalReceiver,
+    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
@@ -236,6 +236,10 @@ pub struct Timeline {

    state: watch::Sender<TimelineState>,

+    /// Prevent two tasks from deleting the timeline at the same time. If held, the
+    /// timeline is being deleted. If 'true', the timeline has already been deleted.
+    pub delete_lock: tokio::sync::Mutex<bool>,
+
    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 }

@@ -521,7 +525,12 @@ impl Timeline {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
+                    Ordering::Equal => {
+                        self.metrics
+                            .materialized_page_cache_hit_upon_request_counter
+                            .inc();
+                        return Ok(cached_img); // exact LSN match, return the image
+                    }
                    Ordering::Greater => {
                        unreachable!("the returned lsn should never be after the requested lsn")
                    }
@@ -536,8 +545,10 @@ impl Timeline {
            img: cached_page_img,
        };

+        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
+        timer.stop_and_record();

        self.metrics
            .reconstruct_time_histo
@@ -622,17 +633,27 @@ impl Timeline {
            .await
        {
            Ok(()) => Ok(()),
-            seqwait_error => {
+            Err(e) => {
+                // Stop the timer now so that the time spent acquiring the walreceiver lock and calling walreceiver.status() below is not counted towards wait_lsn_time_histo.
                drop(_timer);
-                let walreceiver_status = self.walreceiver.status().await;
-                seqwait_error.with_context(|| format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
-                    lsn,
-                    self.get_last_record_lsn(),
-                    self.get_disk_consistent_lsn(),
-                    walreceiver_status.map(|status| status.to_human_readable_string())
-                            .unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
-                ))
+                let walreceiver_status = {
+                    match &*self.walreceiver.lock().unwrap() {
+                        None => "stopping or stopped".to_string(),
+                        Some(walreceiver) => match walreceiver.status() {
+                            Some(status) => status.to_human_readable_string(),
+                            None => "Not active".to_string(),
+                        },
+                    }
+                };
+                Err(anyhow::Error::new(e).context({
+                    format!(
+                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
+                        lsn,
+                        self.get_last_record_lsn(),
+                        self.get_disk_consistent_lsn(),
+                        walreceiver_status,
+                    )
+                }))
            }
        }
    }
@@ -660,7 +681,7 @@ impl Timeline {
    }

    /// Outermost timeline compaction operation; downloads needed layers.
-    pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

        let last_record_lsn = self.get_last_record_lsn();
@@ -749,7 +770,7 @@ impl Timeline {
    }

    /// Compaction which might need to be retried after downloading remote layers.
-    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
+    async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
        //
        // High level strategy for compaction / image creation:
        //
@@ -784,7 +805,7 @@ impl Timeline {
        // Below are functions compact_level0() and create_image_layers()
        // but they are a bit ad hoc and don't quite work like it's explained
        // above. Rewrite it.
-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        let state = *self.state.borrow();
        if state == TimelineState::Stopping {
@@ -818,7 +839,7 @@ impl Timeline {

                // 3. Compact
                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(&layer_removal_cs, target_file_size, ctx)
+                self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
                    .await?;
                timer.stop_and_record();
            }
@@ -907,18 +928,10 @@ impl Timeline {
        Ok(())
    }

-    pub fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
-        if is_broker_client_initialized() {
-            self.launch_wal_receiver(ctx, get_broker_client().clone())?;
-        } else if cfg!(test) {
-            info!("not launching WAL receiver because broker client hasn't been initialized");
-        } else {
-            anyhow::bail!("broker client not initialized");
-        }
-
+    pub fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
+        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
        self.launch_eviction_task();
-        Ok(())
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -1285,6 +1298,13 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

+    fn get_gc_feedback(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .gc_feedback
+            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
+    }
+
    pub(super) fn tenant_conf_updated(&self) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
@@ -1327,15 +1347,7 @@ impl Timeline {
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

        let tenant_conf_guard = tenant_conf.read().unwrap();
-        let wal_connect_timeout = tenant_conf_guard
-            .walreceiver_connect_timeout
-            .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf_guard
-            .lagging_wal_timeout
-            .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
-            .max_lsn_wal_lag
-            .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag);
+
        let evictions_low_residence_duration_metric_threshold =
            Self::get_evictions_low_residence_duration_metric_threshold(
                &tenant_conf_guard,
@@ -1344,18 +1356,6 @@ impl Timeline {
        drop(tenant_conf_guard);

        Arc::new_cyclic(|myself| {
-            let walreceiver = WalReceiver::new(
-                TenantTimelineId::new(tenant_id, timeline_id),
-                Weak::clone(myself),
-                WalReceiverConf {
-                    wal_connect_timeout,
-                    lagging_wal_timeout,
-                    max_lsn_wal_lag,
-                    auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
-                    availability_zone: conf.availability_zone.clone(),
-                },
-            );
-
            let mut result = Timeline {
                conf,
                tenant_conf,
@@ -1367,7 +1367,7 @@ impl Timeline {
                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
-                walreceiver,
+                walreceiver: Mutex::new(None),

                remote_client: remote_client.map(Arc::new),

@@ -1432,6 +1432,7 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
+                delete_lock: tokio::sync::Mutex::new(false),
            };
            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
@@ -1487,17 +1488,49 @@ impl Timeline {
        *flush_loop_state = FlushLoopState::Running;
    }

-    pub(super) fn launch_wal_receiver(
-        &self,
+    /// Creates and starts the wal receiver.
+    ///
+    /// This function is expected to be called at most once per Timeline's lifecycle
+    /// when the timeline is activated.
+    fn launch_wal_receiver(
+        self: &Arc<Self>,
        ctx: &RequestContext,
        broker_client: BrokerClientChannel,
-    ) -> anyhow::Result<()> {
+    ) {
        info!(
            "launching WAL receiver for timeline {} of tenant {}",
            self.timeline_id, self.tenant_id
        );
-        self.walreceiver.start(ctx, broker_client)?;
-        Ok(())
+
+        let tenant_conf_guard = self.tenant_conf.read().unwrap();
+        let wal_connect_timeout = tenant_conf_guard
+            .walreceiver_connect_timeout
+            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
+        let lagging_wal_timeout = tenant_conf_guard
+            .lagging_wal_timeout
+            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
+        let max_lsn_wal_lag = tenant_conf_guard
+            .max_lsn_wal_lag
+            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
+        drop(tenant_conf_guard);
+
+        let mut guard = self.walreceiver.lock().unwrap();
+        assert!(
+            guard.is_none(),
+            "multiple launches / re-launches of WAL receiver are not supported"
+        );
+        *guard = Some(WalReceiver::start(
+            Arc::clone(self),
+            WalReceiverConf {
+                wal_connect_timeout,
+                lagging_wal_timeout,
+                max_lsn_wal_lag,
+                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
+                availability_zone: self.conf.availability_zone.clone(),
+            },
+            broker_client,
+            ctx,
+        ));
    }

    ///
@@ -1891,6 +1924,7 @@ impl Timeline {
                // no cancellation here, because nothing really waits for this to complete compared
                // to spawn_ondemand_logical_size_calculation.
                let cancel = CancellationToken::new();
+
                let calculated_size = match self_clone
                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
@@ -2155,7 +2189,7 @@ impl Timeline {
    fn delete_historic_layer(
        &self,
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        layer: Arc<dyn PersistentLayer>,
        updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
    ) -> anyhow::Result<()> {
@@ -2234,6 +2268,9 @@ impl Timeline {
        let mut timeline_owned;
        let mut timeline = self;

+        let mut read_count =
+            scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
+
        // For debugging purposes, collect the path of layers that we traversed
        // through. It's included in the error message if we fail to find the key.
        let mut traversal_path = Vec::<TraversalPathItem>::new();
@@ -2368,6 +2405,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            // metrics: open_layer does not count as fs access, so we are not updating `read_count`
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2394,6 +2432,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            // metrics: open_layer does not count as fs access, so we are not updating `read_count`
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2428,6 +2467,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            *read_count += 1;
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2493,7 +2533,7 @@ impl Timeline {
                    (DownloadBehavior::Error, false) => {
                        return Err(PageReconstructError::NeedsDownload(
                            TenantTimelineId::new(self.tenant_id, self.timeline_id),
-                            remote_layer.file_name.clone(),
+                            remote_layer.filename(),
                        ))
                    }
                }
@@ -2619,7 +2659,7 @@ impl Timeline {

    /// Layer flusher task's main loop.
    async fn flush_loop(
-        &self,
+        self: &Arc<Self>,
        mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
        ctx: &RequestContext,
    ) {
@@ -2708,9 +2748,9 @@ impl Timeline {
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
    async fn flush_frozen_layer(
-        &self,
+        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -2730,7 +2770,16 @@ impl Timeline {
                    .await?
            } else {
                // normal case, write out a L0 delta layer file.
-                let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
+                let this = self.clone();
+                let frozen_layer = frozen_layer.clone();
+                let span = tracing::info_span!("blocking");
+                let (delta_path, metadata) = tokio::task::spawn_blocking(move || {
+                    let _g = span.entered();
+                    this.create_delta_layer(&frozen_layer)
+                })
+                .await
+                .context("create_delta_layer spawn_blocking")
+                .and_then(|res| res)?;
                HashMap::from([(delta_path, metadata)])
            };

@@ -2834,7 +2883,7 @@ impl Timeline {

    // Write out the given frozen in-memory layer as a new L0 delta file
    fn create_delta_layer(
-        &self,
+        self: &Arc<Self>,
        frozen_layer: &InMemoryLayer,
    ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
        // Write it out
@@ -2850,10 +2899,13 @@ impl Timeline {
        // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
        // files to flush, it might be better to first write them all, and then fsync
        // them all in parallel.
-        par_fsync::par_fsync(&[
-            new_delta_path.clone(),
-            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-        ])?;
+
+        // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
+        // this with a single fsync in future refactors.
+        par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
+        // Then sync the parent directory.
+        par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+            .context("fsync of timeline dir")?;

        // Add it to the layer map
        let l = Arc::new(new_delta);
@@ -3014,6 +3066,7 @@ impl Timeline {
                    self.tenant_id,
                    &img_range,
                    lsn,
+                    false, // image layer always covers the full range
                )?;

                fail_point!("image-layer-writer-fail-before-finish", |_| {
@@ -3077,17 +3130,22 @@ impl Timeline {
        let all_paths = image_layers
            .iter()
            .map(|layer| layer.path())
-            .chain(std::iter::once(
-                self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-            ))
            .collect::<Vec<_>>();
-        par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&all_paths)
+            .await
+            .context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+            .await
+            .context("fsync of timeline dir")?;

        let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());

        let mut layers = self.layers.write().unwrap();
        let mut updates = layers.batch_update();
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+
        for l in image_layers {
            let path = l.filename();
            let metadata = timeline_path
@@ -3146,9 +3204,9 @@ impl Timeline {
    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
    /// start of level0 files compaction, the on-demand download should be revisited as well.
-    async fn compact_level0_phase1(
+    fn compact_level0_phase1(
        &self,
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
@@ -3461,22 +3519,18 @@ impl Timeline {
        if !new_layers.is_empty() {
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

-            // also sync the directory
-            layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
-
            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;

+            par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+                .context("fsync of timeline dir")?;
+
            layer_paths.pop().unwrap();
        }

        drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed

-        fail_point!("compact-level0-phase1-finish", |_| {
-            Err(anyhow::anyhow!("failpoint compact-level0-phase1-finish").into())
-        });
-
        Ok(CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
@@ -3488,17 +3542,26 @@ impl Timeline {
    /// as Level 1 files.
    ///
    async fn compact_level0(
-        &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        self: &Arc<Self>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
+        let this = self.clone();
+        let ctx_inner = ctx.clone();
+        let layer_removal_cs_inner = layer_removal_cs.clone();
+        let span = tracing::info_span!("blocking");
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-        } = self
-            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
-            .await?;
+        } = tokio::task::spawn_blocking(move || {
+            let _g = span.entered();
+            this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
+        })
+        .await
+        .context("compact_level0_phase1 spawn_blocking")
+        .map_err(CompactionError::Other)
+        .and_then(|res| res)?;

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
@@ -3556,7 +3619,7 @@ impl Timeline {
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
        for l in deltas_to_compact {
            layer_names_to_delete.push(l.filename());
-            self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
+            self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
        }
        updates.flush();
        drop(layers);
@@ -3676,7 +3739,7 @@ impl Timeline {

        fail_point!("before-timeline-gc");

-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        let state = *self.state.borrow();
        if state == TimelineState::Stopping {
@@ -3696,7 +3759,7 @@ impl Timeline {

        let res = self
            .gc_timeline(
-                &layer_removal_cs,
+                layer_removal_cs.clone(),
                horizon_cutoff,
                pitr_cutoff,
                retain_lsns,
@@ -3715,7 +3778,7 @@ impl Timeline {

    async fn gc_timeline(
        &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        horizon_cutoff: Lsn,
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
@@ -3855,7 +3918,7 @@ impl Timeline {
                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
                // But image layers are in any case less sparse than delta layers. Also we need some
                // protection from replacing recent image layers with new one after each GC iteration.
-                if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
                    wanted_image_layers.add_range(l.get_key_range());
                }
                result.layers_not_updated += 1;
@@ -3888,7 +3951,11 @@ impl Timeline {
            {
                for doomed_layer in layers_to_remove {
                    layer_names_to_delete.push(doomed_layer.filename());
-                    self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
+                    self.delete_historic_layer(
+                        layer_removal_cs.clone(),
+                        doomed_layer,
+                        &mut updates,
+                    )?; // FIXME: schedule succeeded deletions before returning?
                    result.layers_removed += 1;
                }
            }
@@ -4060,7 +4127,7 @@ impl Timeline {
                // Does retries + exponential back-off internally.
                // When this fails, don't layer further retry attempts here.
                let result = remote_client
-                    .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
+                    .download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata)
                    .await;

                if let Ok(size) = &result {
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -29,16 +29,14 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

-use anyhow::Context;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
-use std::sync::atomic::{self, AtomicBool};
-use std::sync::{Arc, Weak};
+use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
 use tokio::select;
-use tokio::sync::{watch, RwLock};
+use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

@@ -62,46 +60,23 @@ pub struct WalReceiverConf {

 pub struct WalReceiver {
    timeline: TenantTimelineId,
-    timeline_ref: Weak<Timeline>,
-    conf: WalReceiverConf,
-    started: AtomicBool,
-    manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
+    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

 impl WalReceiver {
-    pub fn new(
-        timeline: TenantTimelineId,
-        timeline_ref: Weak<Timeline>,
-        conf: WalReceiverConf,
-    ) -> Self {
-        Self {
-            timeline,
-            timeline_ref,
-            conf,
-            started: AtomicBool::new(false),
-            manager_status: Arc::new(RwLock::new(None)),
-        }
-    }
-
    pub fn start(
-        &self,
-        ctx: &RequestContext,
+        timeline: Arc<Timeline>,
+        conf: WalReceiverConf,
        mut broker_client: BrokerClientChannel,
-    ) -> anyhow::Result<()> {
-        if self.started.load(atomic::Ordering::Acquire) {
-            anyhow::bail!("Wal receiver is already started");
-        }
-
-        let timeline = self.timeline_ref.upgrade().with_context(|| {
-            format!("walreceiver start on a dropped timeline {}", self.timeline)
-        })?;
-
+        ctx: &RequestContext,
+    ) -> Self {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-        let wal_receiver_conf = self.conf.clone();
-        let loop_status = Arc::clone(&self.manager_status);
+
+        let loop_status = Arc::new(std::sync::RwLock::new(None));
+        let manager_status = Arc::clone(&loop_status);
        task_mgr::spawn(
            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
@@ -113,7 +88,7 @@ impl WalReceiver {
                info!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
-                    wal_receiver_conf,
+                    conf,
                );
                loop {
                    select! {
@@ -137,29 +112,29 @@ impl WalReceiver {
                }

                connection_manager_state.shutdown().await;
-                *loop_status.write().await = None;
+                *loop_status.write().unwrap() = None;
                Ok(())
            }
            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
        );

-        self.started.store(true, atomic::Ordering::Release);
-
-        Ok(())
+        Self {
+            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            manager_status,
+        }
    }

-    pub async fn stop(&self) {
+    pub async fn stop(self) {
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
            Some(self.timeline.tenant_id),
            Some(self.timeline.timeline_id),
        )
        .await;
-        self.started.store(false, atomic::Ordering::Release);
    }

-    pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
-        self.manager_status.read().await.clone()
+    pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
+        self.manager_status.read().unwrap().clone()
    }
 }

--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
 use tokio::select;
-use tokio::sync::RwLock;
 use tracing::*;

 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
-    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
+    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
    match connection_manager_state
        .timeline
@@ -195,7 +194,7 @@ pub(super) async fn connection_manager_loop_step(
                .change_connection(new_candidate, ctx)
                .await
        }
-        *manager_status.write().await = Some(connection_manager_state.manager_status());
+        *manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
    }
 }

--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -11,10 +11,12 @@ OBJS = \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o
+	walproposer_utils.o \
+	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
+SHLIB_LINK = -lcurl

 EXTENSION = neon
 DATA = neon--1.0.sql
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -0,0 +1,830 @@
+/*-------------------------------------------------------------------------
+ *
+ * control_plane_connector.c
+ *	  Captures updates to roles/databases using ProcessUtility_hook and
+ *        sends them to the control plane. The changes are sent
+ *        via HTTP to the URL specified by the GUC neon.console_url when the
+ *        transaction commits. Forwarding may be disabled temporarily by
+ *        setting neon.forward_ddl to false.
+ *
+ *        Currently, the transaction may abort AFTER
+ *        changes have already been forwarded, and that case is not handled.
+ *        Subtransactions are handled using a stack of hash tables, which
+ *        accumulate changes. On subtransaction commit, the top of the stack
+ *        is merged with the table below it.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/control_plane_connector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include <curl/curl.h>
+#include "utils/jsonb.h"
+
+static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
+
+/* GUCs */
+static char *ConsoleURL = NULL;
+static bool ForwardDDL = true;
+
+/* Curl structures for sending the HTTP requests */
+static CURL * CurlHandle;
+static struct curl_slist *ContentHeader = NULL;
+
+/*
+ * CURL docs say that this buffer must exist until we call curl_easy_cleanup
+ * (which we never do), so we make this a static
+ */
+static char CurlErrorBuf[CURL_ERROR_SIZE];
+
+typedef enum
+{
+	Op_Set,						/* An upsert: Either a creation or an alter */
+	Op_Delete,
+}			OpType;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	Oid			owner;
+	char		old_name[NAMEDATALEN];
+	OpType		type;
+}			DbEntry;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	char		old_name[NAMEDATALEN];
+	const char *password;
+	OpType		type;
+}			RoleEntry;
+
+/*
+ * We keep one of these for each subtransaction in a stack. When a subtransaction
+ * commits, we merge the top of the stack into the table below it. It is allocated in the
+ * subtransaction's context.
+ */
+typedef struct DdlHashTable
+{
+	struct DdlHashTable *prev_table;
+	HTAB	   *db_table;
+	HTAB	   *role_table;
+}			DdlHashTable;
+
+static DdlHashTable RootTable;
+static DdlHashTable * CurrentDdlTable = &RootTable;
+
+static void
+PushKeyValue(JsonbParseState **state, char *key, char *value)
+{
+	JsonbValue	k,
+				v;
+
+	k.type = jbvString;
+	k.val.string.len = strlen(key);
+	k.val.string.val = key;
+	v.type = jbvString;
+	v.val.string.len = strlen(value);
+	v.val.string.val = value;
+	pushJsonbValue(state, WJB_KEY, &k);
+	pushJsonbValue(state, WJB_VALUE, &v);
+}
+
+static char *
+ConstructDeltaMessage()
+{
+	JsonbParseState *state = NULL;
+
+	pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+	if (RootTable.db_table)
+	{
+		JsonbValue	dbs;
+
+		dbs.type = jbvString;
+		dbs.val.string.val = "dbs";
+		dbs.val.string.len = strlen(dbs.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &dbs);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		DbEntry    *entry;
+
+		hash_seq_init(&status, RootTable.db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->owner != InvalidOid)
+			{
+				PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false));
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+
+	if (RootTable.role_table)
+	{
+		JsonbValue	roles;
+
+		roles.type = jbvString;
+		roles.val.string.val = "roles";
+		roles.val.string.len = strlen(roles.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &roles);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		RoleEntry  *entry;
+
+		hash_seq_init(&status, RootTable.role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->password)
+			{
+				PushKeyValue(&state, "password", (char *) entry->password);
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+	JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+	Jsonb	   *jsonb = JsonbValueToJsonb(result);
+
+	return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ );
+}
+
+#define ERROR_SIZE 1024
+
+typedef struct
+{
+	char		str[ERROR_SIZE];
+	size_t		size;
+}			ErrorString;
+
+static size_t
+ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+	/* curl write callback; docs say size is always 1, so nmemb is the byte count */
+	ErrorString *str = userdata;
+
+	size_t		to_write = nmemb;
+
+	/* Clamp to the space left in str->str, keeping 1 byte for the null terminator */
+	if (str->size + nmemb + 1 >= ERROR_SIZE)
+		to_write = ERROR_SIZE - str->size - 1;
+
+	/* Ignore everything past the first ERROR_SIZE - 1 bytes, but still return nmemb */
+	if (to_write == 0)
+		return nmemb;
+	memcpy(str->str + str->size, ptr, to_write);
+	str->size += to_write;
+	str->str[str->size] = '\0';
+	return nmemb;
+}
+
+static void
+SendDeltasToControlPlane()
+{
+	if (!RootTable.db_table && !RootTable.role_table)
+		return;
+	if (!ConsoleURL)
+	{
+		elog(LOG, "ConsoleURL not set, skipping forwarding");
+		return;
+	}
+	if (!ForwardDDL)
+		return;
+
+	char	   *message = ConstructDeltaMessage();
+	ErrorString str = {};
+
+	curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH");
+	curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader);
+	curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL);
+	curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
+	curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ );
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str);
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+
+	const int	num_retries = 5;
+	int			curl_status;
+
+	for (int i = 0; i < num_retries; i++)
+	{
+		if ((curl_status = curl_easy_perform(CurlHandle)) == 0)
+			break;
+		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
+		pg_usleep(1000 * 1000);
+	}
+	if (curl_status != 0)
+	{
+		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
+	}
+	else
+	{
+		long		response_code;
+
+		if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
+		{
+			bool		error_exists = str.size != 0;
+
+			if (response_code != 200)
+			{
+				if (error_exists)
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane: %s",
+						 response_code,
+						 str.str);
+				}
+				else
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane",
+						 response_code);
+				}
+			}
+		}
+	}
+}
+
+static void
+InitDbTableIfNeeded()
+{
+	if (!CurrentDdlTable->db_table)
+	{
+		HASHCTL		db_ctl = {};
+
+		db_ctl.keysize = NAMEDATALEN;
+		db_ctl.entrysize = sizeof(DbEntry);
+		db_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->db_table = hash_create(
+												"Dbs Created",
+												4,
+												&db_ctl,
+												HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+InitRoleTableIfNeeded()
+{
+	if (!CurrentDdlTable->role_table)
+	{
+		HASHCTL		role_ctl = {};
+
+		role_ctl.keysize = NAMEDATALEN;
+		role_ctl.entrysize = sizeof(RoleEntry);
+		role_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->role_table = hash_create(
+												  "Roles Created",
+												  4,
+												  &role_ctl,
+												  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+PushTable()
+{
+	DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
+
+	new_table->prev_table = CurrentDdlTable;
+	new_table->role_table = NULL;
+	new_table->db_table = NULL;
+	CurrentDdlTable = new_table;
+}
+
+/*
+ * Called on subtransaction commit: folds the top-of-stack table into its
+ * parent. A rename in the child is chained onto a rename already recorded in
+ * the parent, so the merged entry keeps the oldest known name.
+ */
+static void
+MergeTable()
+{
+	DdlHashTable *old_table = CurrentDdlTable;
+
+	CurrentDdlTable = old_table->prev_table;
+
+	if (old_table->db_table)
+	{
+		InitDbTableIfNeeded();
+		DbEntry    *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			bool		found = false;
+			DbEntry    *to_write = hash_search(CurrentDdlTable->db_table,
+											   entry->name,
+											   HASH_ENTER,
+											   &found);
+
+			to_write->type = entry->type;
+			/* a freshly entered parent entry is uninitialized: always set owner */
+			if (!found || entry->owner != InvalidOid)
+				to_write->owner = entry->owner;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				DbEntry    *old = hash_search(
+											  CurrentDdlTable->db_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(CurrentDdlTable->db_table, entry->old_name, HASH_REMOVE, NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->db_table);
+	}
+
+	if (old_table->role_table)
+	{
+		InitRoleTableIfNeeded();
+		RoleEntry  *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			bool		found = false;
+			RoleEntry  *to_write = hash_search(CurrentDdlTable->role_table,
+											   entry->name,
+											   HASH_ENTER,
+											   &found);
+
+			to_write->type = entry->type;
+			/* a freshly entered parent entry is uninitialized: always set password */
+			if (!found || entry->password)
+				to_write->password = entry->password;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				RoleEntry  *old = hash_search(
+											  CurrentDdlTable->role_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(CurrentDdlTable->role_table, entry->old_name, HASH_REMOVE, NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->role_table);
+	}
+}
+
+static void
+PopTable()
+{
+	/*
+	 * Current table gets freed because it is allocated in aborted
+	 * subtransaction's memory context.
+	 */
+	CurrentDdlTable = CurrentDdlTable->prev_table;
+}
+
+static void
+NeonSubXactCallback(
+					SubXactEvent event,
+					SubTransactionId mySubid,
+					SubTransactionId parentSubid,
+					void *arg)
+{
+	switch (event)
+	{
+		case SUBXACT_EVENT_START_SUB:
+			return PushTable();
+		case SUBXACT_EVENT_COMMIT_SUB:
+			return MergeTable();
+		case SUBXACT_EVENT_ABORT_SUB:
+			return PopTable();
+		default:
+			return;
+	}
+}
+
+static void
+NeonXactCallback(XactEvent event, void *arg)
+{
+	if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT)
+	{
+		SendDeltasToControlPlane();
+	}
+	RootTable.role_table = NULL;
+	RootTable.db_table = NULL;
+	Assert(CurrentDdlTable == &RootTable);
+}
+
+static void
+HandleCreateDb(CreatedbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	DefElem    *downer = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "owner") == 0)
+			downer = defel;
+	}
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->type = Op_Set;
+	if (downer && downer->arg)
+		entry->owner = get_role_oid(defGetString(downer), false);
+	else
+		entry->owner = GetUserId();
+}
+
+static void
+HandleAlterOwner(AlterOwnerStmt *stmt)
+{
+	if (stmt->objectType != OBJECT_DATABASE)
+		return;
+	InitDbTableIfNeeded();
+	const char *name = strVal(stmt->object);
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									name,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
+	entry->type = Op_Set;
+}
+
+static void
+HandleDbRename(RenameStmt *stmt)
+{
+	Assert(stmt->renameType == OBJECT_DATABASE);
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+	DbEntry    *entry_for_new_name = hash_search(
+												 CurrentDdlTable->db_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->owner = entry->owner;
+		hash_search(
+					CurrentDdlTable->db_table,
+					stmt->subname,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->owner = InvalidOid;
+	}
+}
+
+static void
+HandleDropDb(DropdbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	entry->type = Op_Delete;
+	entry->owner = InvalidOid;
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+}
+
+static void
+HandleCreateRole(CreateRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role,
+									HASH_ENTER,
+									&found);
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass && dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+/* Records a password change made by ALTER ROLE; other role options are not forwarded. */
+static void
+HandleAlterRole(AlterRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	/* We only care about updates to the password */
+	if (!dpass)
+		return;
+	bool		found = false;
+	RoleEntry  *entry = hash_search(CurrentDdlTable->role_table,
+									get_rolespec_name(stmt->role),	/* rolename is NULL for CURRENT_USER etc. */
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleRoleRename(RenameStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	Assert(stmt->renameType == OBJECT_ROLE);
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+
+	RoleEntry  *entry_for_new_name = hash_search(
+												 CurrentDdlTable->role_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->password = entry->password;
+		hash_search(
+					CurrentDdlTable->role_table,
+					entry->name,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->password = NULL;
+	}
+}
+
+/* Records a pending Op_Delete for each role named in a DROP ROLE statement. */
+static void
+HandleDropRole(DropRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	ListCell   *item;
+
+	foreach(item, stmt->roles)
+	{
+		RoleSpec   *spec = lfirst(item);
+		bool		found = false;
+		RoleEntry  *entry;
+		/* CURRENT_USER etc. carry no name to key on; core DROP ROLE rejects them */
+		if (spec->roletype != ROLESPEC_CSTRING)
+			continue;
+		entry = hash_search(CurrentDdlTable->role_table, spec->rolename, HASH_ENTER, &found);
+		entry->type = Op_Delete;
+		entry->password = NULL;
+		if (!found)				/* new entry: clear the whole old_name buffer */
+			memset(entry->old_name, 0, sizeof(entry->old_name));
+	}
+}
+
+static void
+HandleRename(RenameStmt *stmt)
+{
+	if (stmt->renameType == OBJECT_DATABASE)
+		return HandleDbRename(stmt);
+	else if (stmt->renameType == OBJECT_ROLE)
+		return HandleRoleRename(stmt);
+}
+
+static void
+NeonProcessUtility(
+				   PlannedStmt *pstmt,
+				   const char *queryString,
+				   bool readOnlyTree,
+				   ProcessUtilityContext context,
+				   ParamListInfo params,
+				   QueryEnvironment *queryEnv,
+				   DestReceiver *dest,
+				   QueryCompletion *qc)
+{
+	Node	   *parseTree = pstmt->utilityStmt;
+
+	switch (nodeTag(parseTree))
+	{
+		case T_CreatedbStmt:
+			HandleCreateDb(castNode(CreatedbStmt, parseTree));
+			break;
+		case T_AlterOwnerStmt:
+			HandleAlterOwner(castNode(AlterOwnerStmt, parseTree));
+			break;
+		case T_RenameStmt:
+			HandleRename(castNode(RenameStmt, parseTree));
+			break;
+		case T_DropdbStmt:
+			HandleDropDb(castNode(DropdbStmt, parseTree));
+			break;
+		case T_CreateRoleStmt:
+			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
+			break;
+		case T_AlterRoleStmt:
+			HandleAlterRole(castNode(AlterRoleStmt, parseTree));
+			break;
+		case T_DropRoleStmt:
+			HandleDropRole(castNode(DropRoleStmt, parseTree));
+			break;
+		default:
+			break;
+	}
+
+	if (PreviousProcessUtilityHook)
+	{
+		PreviousProcessUtilityHook(
+								   pstmt,
+								   queryString,
+								   readOnlyTree,
+								   context,
+								   params,
+								   queryEnv,
+								   dest,
+								   qc);
+	}
+	else
+	{
+		standard_ProcessUtility(
+								pstmt,
+								queryString,
+								readOnlyTree,
+								context,
+								params,
+								queryEnv,
+								dest,
+								qc);
+	}
+}
+
+extern void
+InitControlPlaneConnector()
+{
+	PreviousProcessUtilityHook = ProcessUtility_hook;
+	ProcessUtility_hook = NeonProcessUtility;
+	RegisterXactCallback(NeonXactCallback, NULL);
+	RegisterSubXactCallback(NeonSubXactCallback, NULL);
+
+	DefineCustomStringVariable(
+							   "neon.console_url",
+							   "URL of the Neon Console, which will be forwarded changes to dbs and roles",
+							   NULL,
+							   &ConsoleURL,
+							   NULL,
+							   PGC_POSTMASTER,
+							   0,
+							   NULL,
+							   NULL,
+							   NULL);
+
+	DefineCustomBoolVariable(
+							 "neon.forward_ddl",
+							 "Controls whether to forward DDL to the control plane",
+							 NULL,
+							 &ForwardDDL,
+							 true,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
+
+	if (!jwt_token)
+	{
+		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
+	}
+
+	if (curl_global_init(CURL_GLOBAL_DEFAULT))
+	{
+		elog(ERROR, "Failed to initialize curl");
+	}
+	if ((CurlHandle = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
+	{
+		elog(ERROR, "Failed to initialize content header");
+	}
+
+	if (jwt_token)
+	{
+		char		auth_header[8192];
+
+		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
+		{
+			elog(ERROR, "Failed to initialize authorization header");
+		}
+	}
+}
--- a/pgxn/neon/control_plane_connector.h
+++ b/pgxn/neon/control_plane_connector.h
@@ -0,0 +1,6 @@
+#ifndef CONTROL_PLANE_CONNECTOR_H
+#define CONTROL_PLANE_CONNECTOR_H
+
+void		InitControlPlaneConnector(void);
+
+#endif							/* CONTROL_PLANE_CONNECTOR_H */
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -25,6 +25,7 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
+#include "control_plane_connector.h"

 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -34,7 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+	InitControlPlaneConnector();

+	/* Important: this must happen after other parts of the extension are
+	 * loaded; otherwise any GUC settings made before the extension was
+	 * loaded will be removed. */
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::sync::CancellationToken;
 use utils::{project_git_version, sentry_init::init_sentry};

-use tracing::{error, info, warn};
+use tracing::{error, info, warn, Instrument};

 project_git_version!(GIT_VERSION);

@@ -141,7 +141,6 @@ async fn task_main(
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
-                info!("accepted postgres client connection from {peer_addr}");

                let session_id = uuid::Uuid::new_v4();
                let tls_config = Arc::clone(&tls_config);
@@ -149,18 +148,18 @@ async fn task_main(

                connections.spawn(
                    async move {
-                        info!("spawned a task for {peer_addr}");
-
                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

-                        handle_client(dest_suffix, tls_config, session_id, socket).await
+                        info!(%peer_addr, "serving");
+                        handle_client(dest_suffix, tls_config, socket).await
                    }
                    .unwrap_or_else(|e| {
                        // Acknowledge that the task has finished with an error.
                        error!("per-client task finished with an error: {e:#}");
-                    }),
+                    })
+                    .instrument(tracing::info_span!("handle_client", ?session_id))
                );
            }
            _ = cancellation_token.cancelled() => {
@@ -192,7 +191,6 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
    let mut stream = PqStream::new(Stream::from_raw(raw_stream));

    let msg = stream.read_startup_packet().await?;
-    info!("received {msg:?}");
    use pq_proto::FeStartupPacket::*;

    match msg {
@@ -215,15 +213,19 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
            }
            Ok(raw.upgrade(tls_config).await?)
        }
-        _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
+        unexpected => {
+            info!(
+                ?unexpected,
+                "unexpected startup packet, rejecting connection"
+            );
+            stream.throw_error_str(ERR_INSECURE_CONNECTION).await?
+        }
    }
 }

-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 async fn handle_client(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
-    session_id: uuid::Uuid,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
    let tls_stream = ssl_handshake(stream, tls_config).await?;
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -19,8 +19,10 @@ use tokio::task::JoinHandle;
 use tokio::{runtime, time::sleep};
 use tracing::*;

+use crate::metrics::BROKER_ITERATION_TIMELINES;
 use crate::metrics::BROKER_PULLED_UPDATES;
 use crate::metrics::BROKER_PUSHED_UPDATES;
+use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;

@@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
                BROKER_PUSHED_UPDATES.inc();
            }
            let elapsed = now.elapsed();
-            // Log duration every second. Should be about 10MB of logs per day.
-            info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+
+            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
+            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+
+            if elapsed > push_interval / 2 {
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+            }
+
            sleep(push_interval).await;
        }
    };
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -125,6 +125,28 @@ pub static BACKUP_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_backup_errors_total counter")
 });
+/// Time taken to push all timeline updates to the broker, in seconds.
+pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_push_update_seconds",
+        "Seconds to push all timeline updates to the broker",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_push_update_seconds histogram")
+});
+/// Bucket boundaries used by [`BROKER_ITERATION_TIMELINES`].
+pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[
+    1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0,
+];
+/// Count of timelines pushed to the broker in a single iteration.
+pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_iteration_timelines",
+        "Count of timelines pushed to the broker in a single iteration",
+        TIMELINES_COUNT_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_iteration_timelines histogram")
+});

 pub const LABEL_UNKNOWN: &str = "unknown";

--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -1,5 +1,5 @@
 //
-// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
+// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch.
 //
 // The comment is updated on each run with the latest results.
 //
@@ -7,7 +7,7 @@
 // - uses: actions/github-script@v6
 //   with:
 //     script: |
-//       const script = require("./scripts/pr-comment-test-report.js")
+//       const script = require("./scripts/comment-test-report.js")
 //       await script({
 //         github,
 //         context,
@@ -35,8 +35,12 @@ class DefaultMap extends Map {
 module.exports = async ({ github, context, fetch, report }) => {
    // Marker to find the comment in the subsequent runs
    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // If we run the script in the PR or in the branch (main/release/...)
+    const isPullRequest = !!context.payload.pull_request
+    // Latest commit in PR or in the branch
+    const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
    // Let users know that the comment is updated automatically
-    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:</sub></div>`
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
    const githubActionsBotId = 41898282
    // Commend body itself
@@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => {

    commentBody += autoupdateNotice

-    const { data: comments } = await github.rest.issues.listComments({
-        issue_number: context.payload.number,
+    let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
+    if (isPullRequest) {
+        createCommentFn  = github.rest.issues.createComment
+        listCommentsFn   = github.rest.issues.listComments
+        updateCommentFn  = github.rest.issues.updateComment
+        issueNumberOrSha = {
+            issue_number: context.payload.number,
+        }
+    } else {
+        updateCommentFn  = github.rest.repos.updateCommitComment
+        listCommentsFn   = github.rest.repos.listCommentsForCommit
+        createCommentFn  = github.rest.repos.createCommitComment
+        issueNumberOrSha = {
+            commit_sha: commitSha,
+        }
+    }
+
+    const { data: comments } = await listCommentsFn({
+        ...issueNumberOrSha,
        ...ownerRepoParams,
    })

    const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
    if (comment) {
-        await github.rest.issues.updateComment({
+        await updateCommentFn({
            comment_id: comment.id,
            body: commentBody,
            ...ownerRepoParams,
        })
    } else {
-        await github.rest.issues.createComment({
-            issue_number: context.payload.number,
+        await createCommentFn({
            body: commentBody,
+            ...issueNumberOrSha,
            ...ownerRepoParams,
        })
    }
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -156,7 +156,9 @@ class LLVM:
             profdata: Path,
             objects: List[str],
             sources: List[str],
-             demangler: Optional[Path] = None) -> None:
+             demangler: Optional[Path] = None,
+             output_file: Optional[Path] = None,
+             ) -> None:

        cwd = self.cargo.cwd
        objects = list(intersperse('-object', objects))
@@ -180,14 +182,18 @@ class LLVM:
            *objects,
            *sources,
        ]
-        subprocess.check_call(cmd, cwd=cwd)
+        if output_file is not None:
+            with output_file.open('w') as outfile:
+                subprocess.check_call(cmd, cwd=cwd, stdout=outfile)
+        else:
+            subprocess.check_call(cmd, cwd=cwd)

    def cov_report(self, **kwargs) -> None:
        self._cov(subcommand='report', **kwargs)

-    def cov_export(self, *, kind: str, **kwargs) -> None:
+    def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None:
        extras = (f'-format={kind}', )
-        self._cov(subcommand='export', *extras, **kwargs)
+        self._cov(subcommand='export', *extras, output_file=output_file, **kwargs)

    def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
        extras = [f'-format={kind}']
@@ -283,9 +289,12 @@ class TextReport(Report):
        self.llvm.cov_show(kind='text', **self._common_kwargs())


+@dataclass
 class LcovReport(Report):
+    output_file: Path
+
    def generate(self) -> None:
-        self.llvm.cov_export(kind='lcov', **self._common_kwargs())
+        self.llvm.cov_export(kind='lcov',  output_file=self.output_file, **self._common_kwargs())


@dataclass
@@ -475,7 +484,7 @@ class State:
            'text':
            lambda: TextReport(**params),
            'lcov':
-            lambda: LcovReport(**params),
+            lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'),
            'summary':
            lambda: SummaryReport(**params),
            'github':
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -535,8 +535,8 @@ def export_timeline(


 def main(args: argparse.Namespace):
-    # any psql version will do here. use current DEFAULT_PG_VERSION = 14
-    psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql")
+    # any psql version will do here. use current DEFAULT_PG_VERSION = 15
+    psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")

    old_pageserver_host = args.old_pageserver_host
    new_pageserver_host = args.new_pageserver_host
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -40,6 +40,9 @@ pub type BrokerClientChannel = BrokerServiceClient<Channel>;
 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
 // validated here.
+//
+// NB: this function is not async, but still must be run on a tokio runtime thread
+// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call.
 pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel>
 where
    U: std::convert::TryInto<Uri>,
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -65,12 +65,19 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_bucket",
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
+    "pageserver_getpage_get_reconstruct_data_seconds_bucket",
+    "pageserver_getpage_get_reconstruct_data_seconds_count",
+    "pageserver_getpage_get_reconstruct_data_seconds_sum",
    "pageserver_io_operations_bytes_total",
    "pageserver_io_operations_seconds_bucket",
    "pageserver_io_operations_seconds_count",
    "pageserver_io_operations_seconds_sum",
    "pageserver_last_record_lsn",
    "pageserver_materialized_cache_hits_total",
+    "pageserver_materialized_cache_hits_direct_total",
+    "pageserver_read_num_fs_layers_bucket",
+    "pageserver_read_num_fs_layers_count",
+    "pageserver_read_num_fs_layers_sum",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1603,8 +1603,6 @@ class NeonPageserver(PgProtocol):
            # https://github.com/neondatabase/neon/issues/2442
            ".*could not remove ephemeral file.*No such file or directory.*",
            # FIXME: These need investigation
-            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
-            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
            ".*Removing intermediate uninit mark file.*",
@@ -1621,6 +1619,10 @@ class NeonPageserver(PgProtocol):
            ".*task iteration took longer than the configured period.*",
            # this is until #3501
            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            # these can happen anytime we do compactions from background task and shutdown pageserver
+            r".*ERROR.*ancestor timeline \S+ is being stopped",
+            # this is expected given our collaborative shutdown approach for the UploadQueue
+            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
        ]

    def start(
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -155,14 +155,14 @@ class PageserverHttpClient(requests.Session):
        return res_json

    def tenant_create(
-        self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None
+        self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
    ) -> TenantId:
        if conf is not None:
            assert "new_tenant_id" not in conf.keys()
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant",
            json={
-                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+                "new_tenant_id": str(new_tenant_id),
                **(conf or {}),
            },
        )
@@ -293,13 +293,13 @@ class PageserverHttpClient(requests.Session):
        self,
        pg_version: PgVersion,
        tenant_id: TenantId,
-        new_timeline_id: Optional[TimelineId] = None,
+        new_timeline_id: TimelineId,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
-            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
+            "new_timeline_id": str(new_timeline_id),
            "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
            "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
        }
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum):
    def __repr__(self) -> str:
        return f"'{self.value}'"

+    # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums
+    def __str__(self) -> str:
+        return self.value
+
    # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14),
    # sometime we need to do so in tests.
    @property
@@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser):
@pytest.fixture(scope="session")
 def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
    if v := request.config.getoption("--pg-version"):
-        version, source = v, "from --pg-version commad-line argument"
+        version, source = v, "from --pg-version command-line argument"
    elif v := os.environ.get("DEFAULT_PG_VERSION"):
        version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
    else:
-        version, source = DEFAULT_VERSION, "default verson"
+        version, source = DEFAULT_VERSION, "default version"

    log.info(f"pg_version is {version} ({source})")
    yield version
--- a/test_runner/performance/test_duplicate_layers.py
+++ b/test_runner/performance/test_duplicate_layers.py
@@ -1,43 +0,0 @@
-import time
-
-import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-
-
-# Test duplicate layer detection
-#
-# This test sets fail point at the end of first compaction phase:
-# after flushing new L1 layers but before deletion of L0 layes
-# It should cause generation of duplicate L1 layer by compaction after restart
-@pytest.mark.timeout(600)
-def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-
-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-    #env.pageserver.allowed_errors.append(".*duplicate layer.*")
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive compaction and checkpoint settings
-    tenant_id, _ = env.neon_cli.create_tenant(
-        conf={
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_target_size": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            "compaction_threshold": "5",
-        }
-    )
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
-
-    pageserver_http.configure_failpoints(("compact-level0-phase1-finish", "exit"))
-
-    with pytest.raises(Exception):
-        pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
-    time.sleep(6)  # let compaction to be performed
-    env.pageserver.stop()
-    env.pageserver.start()
-    time.sleep(6)  # let compaction to be performed
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -158,6 +158,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
            "threshold": "23h",
        },
        "evictions_low_residence_duration_metric_threshold": "2days",
+        "gc_feedback": True,
        "gc_horizon": 23 * (1024 * 1024),
        "gc_period": "2h 13m",
        "image_creation_threshold": 7,
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -3,7 +3,7 @@ from contextlib import closing
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId


 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
@@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=tenant_token)
    ps.safe_psql("set FOO", password=pageserver_token)

-    new_timeline_id = env.neon_cli.create_branch(
-        "test_pageserver_auth", tenant_id=env.initial_tenant
-    )
-
    # tenant can create branches
    tenant_http_client.timeline_create(
        pg_version=env.pg_version,
        tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
    )
    # console can create branches for tenant
    pageserver_http_client.timeline_create(
        pg_version=env.pg_version,
        tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
    )

    # fail to create branch using token with different tenant_id
@@ -49,18 +47,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
        invalid_tenant_http_client.timeline_create(
            pg_version=env.pg_version,
            tenant_id=env.initial_tenant,
-            ancestor_timeline_id=new_timeline_id,
+            new_timeline_id=TimelineId.generate(),
+            ancestor_timeline_id=env.initial_timeline,
        )

    # create tenant using management token
-    pageserver_http_client.tenant_create()
+    pageserver_http_client.tenant_create(TenantId.generate())

    # fail to create tenant using tenant token
    with pytest.raises(
        PageserverApiException,
        match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
    ):
-        tenant_http_client.tenant_create()
+        tenant_http_client.tenant_create(TenantId.generate())


 def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
-            ".*could not load tenant.*load local timeline.*",
+            ".*load failed.*load local timeline.*",
        ]
    )

--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -0,0 +1,210 @@
+from types import TracebackType
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import psycopg2
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import VanillaPostgres
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+def handle_db(dbs, roles, operation):
+    if operation["op"] == "set":
+        if "old_name" in operation and operation["old_name"] in dbs:
+            dbs[operation["name"]] = dbs[operation["old_name"]]
+            dbs.pop(operation["old_name"])
+        if "owner" in operation:
+            dbs[operation["name"]] = operation["owner"]
+    elif operation["op"] == "del":
+        dbs.pop(operation["name"])
+    else:
+        raise ValueError("Invalid op")
+
+
+def handle_role(dbs, roles, operation):
+    if operation["op"] == "set":
+        if "old_name" in operation and operation["old_name"] in roles:
+            roles[operation["name"]] = roles[operation["old_name"]]
+            roles.pop(operation["old_name"])
+            for db, owner in dbs.items():
+                if owner == operation["old_name"]:
+                    dbs[db] = operation["name"]
+        if "password" in operation:
+            roles[operation["name"]] = operation["password"]
+    elif operation["op"] == "del":
+        if "old_name" in operation:
+            roles.pop(operation["old_name"])
+        roles.pop(operation["name"])
+    else:
+        raise ValueError("Invalid op")
+
+
+fail = False
+
+
+def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response:
+    log.info(f"Received request with data {request.get_data(as_text=True)}")
+    if fail:
+        log.info("FAILING")
+        return Response(status=500, response="Failed just cuz")
+    if request.json is None:
+        log.info("Received invalid JSON")
+        return Response(status=400)
+    json = request.json
+    # Handle roles first
+    if "roles" in json:
+        for operation in json["roles"]:
+            handle_role(dbs, roles, operation)
+    if "dbs" in json:
+        for operation in json["dbs"]:
+            handle_db(dbs, roles, operation)
+    return Response(status=200)
+
+
+class DdlForwardingContext:
+    def __init__(self, httpserver: HTTPServer, vanilla_pg: VanillaPostgres, host: str, port: int):
+        self.server = httpserver
+        self.pg = vanilla_pg
+        self.host = host
+        self.port = port
+        self.dbs: Dict[str, str] = {}
+        self.roles: Dict[str, str] = {}
+        endpoint = "/management/api/v2/roles_and_databases"
+        ddl_url = f"http://{host}:{port}{endpoint}"
+        self.pg.configure(
+            [
+                f"neon.console_url={ddl_url}",
+                "shared_preload_libraries = 'neon'",
+            ]
+        )
+        log.info(f"Listening on {ddl_url}")
+        self.server.expect_request(endpoint, method="PATCH").respond_with_handler(
+            lambda request: ddl_forward_handler(request, self.dbs, self.roles)
+        )
+
+    def __enter__(self):
+        self.pg.start()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
+        self.pg.stop()
+
+    def send(self, query: str) -> List[Tuple[Any, ...]]:
+        return self.pg.safe_psql(query)
+
+    def wait(self, timeout=3):
+        self.server.wait(timeout=timeout)
+
+    def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]:
+        res = self.send(query)
+        self.wait(timeout=timeout)
+        return res
+
+
+@pytest.fixture(scope="function")
+def ddl(
+    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int]
+):
+    (host, port) = httpserver_listen_address
+    with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl:
+        yield ddl
+
+
+def test_ddl_forwarding(ddl: DdlForwardingContext):
+    curr_user = ddl.send("SELECT current_user")[0][0]
+    log.info(f"Current user is {curr_user}")
+    ddl.send_and_wait("CREATE DATABASE bork")
+    assert ddl.dbs == {"bork": curr_user}
+    ddl.send_and_wait("CREATE ROLE volk WITH PASSWORD 'nu_zayats'")
+    ddl.send_and_wait("ALTER DATABASE bork RENAME TO nu_pogodi")
+    assert ddl.dbs == {"nu_pogodi": curr_user}
+    ddl.send_and_wait("ALTER DATABASE nu_pogodi OWNER TO volk")
+    assert ddl.dbs == {"nu_pogodi": "volk"}
+    ddl.send_and_wait("DROP DATABASE nu_pogodi")
+    assert ddl.dbs == {}
+    ddl.send_and_wait("DROP ROLE volk")
+    assert ddl.roles == {}
+
+    ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'")
+    assert ddl.roles == {"tarzan": "of_the_apes"}
+    ddl.send_and_wait("DROP ROLE tarzan")
+    assert ddl.roles == {}
+    ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'")
+    assert ddl.roles == {"tarzan": "of_the_apes"}
+    ddl.send_and_wait("ALTER ROLE tarzan WITH PASSWORD 'jungle_man'")
+    assert ddl.roles == {"tarzan": "jungle_man"}
+    ddl.send_and_wait("ALTER ROLE tarzan RENAME TO mowgli")
+    assert ddl.roles == {"mowgli": "jungle_man"}
+    ddl.send_and_wait("DROP ROLE mowgli")
+    assert ddl.roles == {}
+
+    conn = ddl.pg.connect()
+    cur = conn.cursor()
+
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'cork'")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"bork": "cork"}
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE stork WITH PASSWORD 'pork'")
+    cur.execute("ABORT")
+    ddl.wait()
+    assert ("stork", "pork") not in ddl.roles.items()
+    cur.execute("BEGIN")
+    cur.execute("ALTER ROLE bork WITH PASSWORD 'pork'")
+    cur.execute("ALTER ROLE bork RENAME TO stork")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"stork": "pork"}
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE dork WITH PASSWORD 'york'")
+    cur.execute("SAVEPOINT point")
+    cur.execute("ALTER ROLE dork WITH PASSWORD 'zork'")
+    cur.execute("ALTER ROLE dork RENAME TO fork")
+    cur.execute("ROLLBACK TO SAVEPOINT point")
+    cur.execute("ALTER ROLE dork WITH PASSWORD 'fork'")
+    cur.execute("ALTER ROLE dork RENAME TO zork")
+    cur.execute("RELEASE SAVEPOINT point")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"stork": "pork", "zork": "fork"}
+
+    cur.execute("DROP ROLE stork")
+    cur.execute("DROP ROLE zork")
+    ddl.wait()
+    assert ddl.roles == {}
+
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'")
+    cur.execute("CREATE ROLE stork WITH PASSWORD 'cork'")
+    cur.execute("BEGIN")
+    cur.execute("DROP ROLE bork")
+    cur.execute("ALTER ROLE stork RENAME TO bork")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"bork": "cork"}
+
+    cur.execute("DROP ROLE bork")
+    ddl.wait()
+    assert ddl.roles == {}
+
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'")
+    cur.execute("CREATE DATABASE stork WITH OWNER=bork")
+    cur.execute("ALTER ROLE bork RENAME TO cork")
+    ddl.wait()
+    assert ddl.dbs == {"stork": "cork"}
+
+    with pytest.raises(psycopg2.InternalError):
+        global fail
+        fail = True
+        cur.execute("CREATE DATABASE failure WITH OWNER=cork")
+        ddl.wait()
+
+    conn.close()
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -228,7 +228,6 @@ def proxy_with_metric_collector(
@pytest.mark.asyncio
 async def test_proxy_metric_collection(
    httpserver: HTTPServer,
-    httpserver_listen_address,
    proxy_with_metric_collector: NeonProxy,
    vanilla_pg: VanillaPostgres,
 ):
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -140,7 +140,7 @@ def test_remote_storage_backup_and_restore(
    # This is before the failures injected by test_remote_failures, so it's a permanent error.
    pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
    env.pageserver.allowed_errors.append(
-        ".*error attaching tenant: storage-sync-list-remote-timelines",
+        ".*attach failed.*: storage-sync-list-remote-timelines",
    )
    # Attach it. This HTTP request will succeed and launch a
    # background task to load the tenant. In that background task,
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -37,6 +37,7 @@ class PgSniRouter(PgProtocol):
        destination: str,
        tls_cert: Path,
        tls_key: Path,
+        test_output_dir: Path,
    ):
        # Must use a hostname rather than IP here, for SNI to work
        host = "localhost"
@@ -49,6 +50,7 @@ class PgSniRouter(PgProtocol):
        self.tls_cert = tls_cert
        self.tls_key = tls_key
        self._popen: Optional[subprocess.Popen[bytes]] = None
+        self.test_output_dir = test_output_dir

    def start(self) -> "PgSniRouter":
        assert self._popen is None
@@ -60,8 +62,12 @@ class PgSniRouter(PgProtocol):
            *["--destination", self.destination],
        ]

-        self._popen = subprocess.Popen(args)
+        router_log_path = self.test_output_dir / "pg_sni_router.log"
+        # The child gets its own copy of the descriptor; close ours right after spawn.
+        with open(router_log_path, "w") as router_log:
+            self._popen = subprocess.Popen(args, stderr=router_log)
        self._wait_until_ready()
+        log.info(f"pg_sni_router started, log file: {router_log_path}")
        return self

    @backoff.on_exception(backoff.expo, OSError, max_time=10)
@@ -121,6 +127,7 @@ def test_pg_sni_router(
        destination="localtest.me",
        tls_cert=test_output_dir / "router.crt",
        tls_key=test_output_dir / "router.key",
+        test_output_dir=test_output_dir,
    ) as router:
        router.start()

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -647,7 +647,9 @@ def test_ignored_tenant_stays_broken_without_metadata(
            metadata_removed = True
    assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"

-    env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*")
+    env.pageserver.allowed_errors.append(
+        f".*{tenant_id}.*: load failed.*: failed to load metadata.*"
+    )

    # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
    pageserver_http.tenant_load(tenant_id=tenant_id)
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -22,6 +22,7 @@ from fixtures.neon_fixtures import (
    available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
 from prometheus_client.samples import Sample


@@ -308,27 +309,26 @@ def test_pageserver_with_empty_tenants(
    env.pageserver.allowed_errors.append(
        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
    )
-    env.pageserver.allowed_errors.append(
-        ".*could not load tenant.*Failed to list timelines directory.*"
-    )
+    env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")

    client = env.pageserver.http_client()

-    tenant_with_empty_timelines_dir = client.tenant_create()
-    temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir)
+    tenant_with_empty_timelines = TenantId.generate()
+    client.tenant_create(tenant_with_empty_timelines)
+    temp_timelines = client.timeline_list(tenant_with_empty_timelines)
    for temp_timeline in temp_timelines:
        client.timeline_delete(
-            tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"])
+            tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
        )
    files_in_timelines_dir = sum(
        1
        for _p in Path.iterdir(
-            Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines"
+            Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines) / "timelines"
        )
    )
    assert (
        files_in_timelines_dir == 0
-    ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
+    ), f"Tenant {tenant_with_empty_timelines} should have an empty timelines/ directory"

    # Trigger timeline re-initialization after pageserver restart
    env.endpoints.stop_all()
@@ -340,9 +340,15 @@ def test_pageserver_with_empty_tenants(
    env.pageserver.start()

    client = env.pageserver.http_client()
-    tenants = client.tenant_list()

-    assert len(tenants) == 2
+    def not_loading():
+        tenants = client.tenant_list()
+        assert len(tenants) == 2
+        assert all(t["state"]["slug"] != "Loading" for t in tenants)
+
+    wait_until(10, 0.2, not_loading)
+
+    tenants = client.tenant_list()

    [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
    assert (
@@ -354,17 +360,17 @@ def test_pageserver_with_empty_tenants(
        broken_tenant_status["state"]["slug"] == "Broken"
    ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"

-    assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
+    assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*")

-    [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
+    [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)]
    assert (
        loaded_tenant["state"]["slug"] == "Active"
-    ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
+    ), f"Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation"

-    loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
+    loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines)
    assert (
        loaded_tenant_status["state"]["slug"] == "Active"
-    ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active"
+    ), f"Tenant {tenant_with_empty_timelines} with empty timelines dir should be active"

    time.sleep(1)  # to allow metrics propagation

@@ -374,7 +380,7 @@ def test_pageserver_with_empty_tenants(
        "state": "Broken",
    }
    active_tenants_metric_filter = {
-        "tenant_id": str(tenant_with_empty_timelines_dir),
+        "tenant_id": str(tenant_with_empty_timelines),
        "state": "Active",
    }

@@ -386,7 +392,7 @@ def test_pageserver_with_empty_tenants(

    assert (
        tenant_active_count == 1
-    ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
+    ), f"Tenant {tenant_with_empty_timelines} should have metric as active"

    tenant_broken_count = int(
        ps_metrics.query_one(
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -371,7 +371,7 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(

        # make the second call and assert behavior
        log.info("second call start")
-        error_msg_re = "another task is already setting the deleted_flag, started at"
+        error_msg_re = "timeline deletion is already in progress"
        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
        assert second_call_err.value.status_code == 500
@@ -437,12 +437,22 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):

    wait_until(50, 0.1, got_hangup_log_message)

-    # ok, retry without failpoint, it should succeed
+    # check that the timeline is still present
+    ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
+
+    # ok, disable the failpoint to let the deletion finish
    ps_http.configure_failpoints((failpoint_name, "off"))

-    # this should succeed
-    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
-    # the second call will try to transition the timeline into Stopping state, but it's already in that state
-    env.pageserver.allowed_errors.append(
-        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
-    )
+    def first_request_finished():
+        message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
+        assert env.pageserver.log_contains(message)
+
+    wait_until(50, 0.1, first_request_finished)
+
+    # check that the timeline is gone
+    notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found"
+    env.pageserver.allowed_errors.append(".*" + notfound_message)
+    with pytest.raises(PageserverApiException, match=notfound_message) as exc:
+        ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
+
+    assert exc.value.status_code == 404
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -27,7 +27,6 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
-hashbrown = { version = "0.12", features = ["raw"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -39,7 +38,7 @@ num-traits = { version = "0.2", features = ["i128"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
-regex-syntax = { version = "0.6" }
+regex-syntax = { version = "0.7" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.20", features = ["dangerous_configuration"] }
@@ -62,7 +61,6 @@ url = { version = "2", features = ["serde"] }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 either = { version = "1" }
-hashbrown = { version = "0.12", features = ["raw"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -70,7 +68,7 @@ memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
-regex-syntax = { version = "0.6" }
+regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }