Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14

Proxy release 2024-03-14
Merge branch 'release-proxy' into rc/proxy/2024-03-14
2026-08-02 10:50:38 +00:00 · 2024-03-14 14:57:05 +05:00 · 2024-03-14 14:16:36 +05:00 · 2024-03-14 09:11:57 +00:00 · 2024-03-14 08:20:56 +00:00 · 2024-03-13 17:49:17 +00:00
108 changed files with 3657 additions and 1564 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -474,7 +474,7 @@ jobs:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored

      # Temporary disable this step until we figure out why it's so flaky
@@ -554,7 +554,7 @@ jobs:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -282,8 +282,10 @@ dependencies = [
 "control_plane",
 "diesel",
 "diesel_migrations",
+ "fail",
 "futures",
 "git-version",
+ "hex",
 "humantime",
 "hyper",
 "metrics",
--- a/2
+++ b/2
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
+# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

 #
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use nix::unistd::Pid;
 use postgres::error::SqlState;
 use postgres::{Client, NoTls};
 use tracing::{debug, error, info, instrument, warn};
@@ -722,8 +723,12 @@ impl ComputeNode {
        // Stop it when it's ready
        info!("waiting for postgres");
        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
+        // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
+        // it to avoid orphaned processes prowling around while datadir is
+        // wiped.
+        let pm_pid = Pid::from_raw(pg.id() as i32);
+        kill(pm_pid, Signal::SIGQUIT)?;
+        info!("sent SIGQUIT signal");
        pg.wait()?;
        info!("done prewarming");

--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -19,8 +19,10 @@ aws-config.workspace = true
 aws-sdk-secretsmanager.workspace = true
 camino.workspace = true
 clap.workspace = true
+fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
+hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
 once_cell.workspace = true
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -10,7 +10,9 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
+use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
+use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
 use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
@@ -30,7 +32,7 @@ use pageserver_api::controller_api::{
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

-use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
+use control_plane::storage_controller::{AttachHookRequest, InspectRequest};

 /// State available to HTTP request handlers
 #[derive(Clone)]
@@ -174,14 +176,14 @@ async fn handle_tenant_location_config(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
        service
-            .tenant_location_config(tenant_id, config_req)
+            .tenant_location_config(tenant_shard_id, config_req)
            .await?,
    )
 }
@@ -554,6 +556,9 @@ pub fn make_router(
        .post("/debug/v1/consistency_check", |r| {
            request_span(r, handle_consistency_check)
        })
+        .put("/debug/v1/failpoints", |r| {
+            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
+        })
        .get("/control/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(r, handle_tenant_locate)
        })
@@ -587,7 +592,7 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            tenant_service_handler(r, handle_tenant_config_get)
        })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
+        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
--- a/control_plane/attachment_service/src/id_lock_map.rs
+++ b/control_plane/attachment_service/src/id_lock_map.rs
@@ -0,0 +1,54 @@
+use std::{collections::HashMap, sync::Arc};
+
+/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
+/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
+/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
+/// is needed at a tenant-wide granularity.
+pub(crate) struct IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    /// A synchronous lock for getting/setting the async locks that our callers will wait on.
+    entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
+}
+
+impl<T> IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    pub(crate) fn shared(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        let entry = locked.entry(key).or_default();
+        entry.clone().read_owned()
+    }
+
+    pub(crate) fn exclusive(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        let entry = locked.entry(key).or_default();
+        entry.clone().write_owned()
+    }
+
+    /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
+    /// periodic housekeeping to avoid the map growing indefinitely
+    pub(crate) fn housekeeping(&self) {
+        let mut locked = self.entities.lock().unwrap();
+        locked.retain(|_k, lock| lock.try_write().is_err())
+    }
+}
+
+impl<T> Default for IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    fn default() -> Self {
+        Self {
+            entities: std::sync::Mutex::new(HashMap::new()),
+        }
+    }
+}
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod compute_hook;
 pub mod http;
+mod id_lock_map;
 pub mod metrics;
 mod node;
 pub mod persistence;
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,9 +1,3 @@
-/// The attachment service mimics the aspects of the control plane API
-/// that are required for a pageserver to operate.
-///
-/// This enables running & testing pageservers without a full-blown
-/// deployment of the Neon cloud platform.
-///
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
@@ -212,6 +206,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }

 fn main() -> anyhow::Result<()> {
+    let default_panic = std::panic::take_hook();
+    std::panic::set_hook(Box::new(move |info| {
+        default_panic(info);
+        std::process::exit(1);
+    }));
+
    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -83,29 +83,38 @@ impl Node {
        }
    }

-    pub(crate) fn set_availability(
-        &mut self,
-        availability: NodeAvailability,
-    ) -> AvailabilityTransition {
-        use NodeAvailability::*;
-        let transition = match (self.availability, availability) {
-            (Offline, Active) => {
+    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
+        match self.get_availability_transition(availability) {
+            AvailabilityTransition::ToActive => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state.  For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
-                AvailabilityTransition::ToActive
            }
-            (Active, Offline) => {
+            AvailabilityTransition::ToOffline => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
-                AvailabilityTransition::ToOffline
            }
-            _ => AvailabilityTransition::Unchanged,
-        };
+            AvailabilityTransition::Unchanged => {}
+        }
        self.availability = availability;
-        transition
+    }
+
+    /// Without modifying the availability of the node, convert the intended availability
+    /// into a description of the transition.
+    pub(crate) fn get_availability_transition(
+        &self,
+        availability: NodeAvailability,
+    ) -> AvailabilityTransition {
+        use AvailabilityTransition::*;
+        use NodeAvailability::*;
+
+        match (self.availability, availability) {
+            (Offline, Active) => ToActive,
+            (Active, Offline) => ToOffline,
+            _ => Unchanged,
+        }
    }

    /// Whether we may send API requests to this node.
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -11,6 +11,9 @@ use diesel::prelude::*;
 use diesel::Connection;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::ShardConfigError;
+use pageserver_api::shard::ShardIdentity;
+use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
@@ -20,7 +23,7 @@ use crate::node::Node;

 /// ## What do we store?
 ///
-/// The attachment service does not store most of its state durably.
+/// The storage controller service does not store most of its state durably.
 ///
 /// The essential things to store durably are:
 /// - generation numbers, as these must always advance monotonically to ensure data safety.
@@ -34,7 +37,7 @@ use crate::node::Node;
 ///
 /// ## Performance/efficiency
 ///
-/// The attachment service does not go via the database for most things: there are
+/// The storage controller service does not go via the database for most things: there are
 /// a couple of places where we must, and where efficiency matters:
 /// - Incrementing generation numbers: the Reconciler has to wait for this to complete
 ///   before it can attach a tenant, so this acts as a bound on how fast things like
@@ -72,6 +75,14 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }

+#[must_use]
+pub(crate) enum AbortShardSplitStatus {
+    /// We aborted the split in the database by reverting to the parent shards
+    Aborted,
+    /// The split had already been persisted.
+    Complete,
+}
+
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

 impl Persistence {
@@ -570,6 +581,51 @@ impl Persistence {
        })
        .await
    }
+
+    /// Used when the remote part of a shard split failed: we will revert the database state to have only
+    /// the parent shards, with SplitState::Idle.
+    pub(crate) async fn abort_shard_split(
+        &self,
+        split_tenant_id: TenantId,
+        new_shard_count: ShardCount,
+    ) -> DatabaseResult<AbortShardSplitStatus> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }
+
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
+                            count {new_shard_count:?} on tenant {split_tenant_id}"
+                    )));
+                }
+
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;
+
+                Ok(AbortShardSplitStatus::Aborted)
+            })?;
+
+            Ok(aborted)
+        })
+        .await
+    }
 }

 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
@@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) config: String,
 }

+impl TenantShardPersistence {
+    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
+        if self.shard_count == 0 {
+            Ok(ShardIdentity::unsharded())
+        } else {
+            Ok(ShardIdentity::new(
+                ShardNumber(self.shard_number as u8),
+                ShardCount::new(self.shard_count as u8),
+                ShardStripeSize(self.shard_stripe_size as u32),
+            )?)
+        }
+    }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8),
+            shard_count: ShardCount::new(self.shard_count as u8),
+        })
+    }
+}
+
 /// Parts of [`crate::node::Node`] that are stored durably
 #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::nodes)]
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,5 +1,6 @@
 use crate::persistence::Persistence;
 use crate::service;
+use hyper::StatusCode;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
 use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};

+const DEFAULT_HEATMAP_PERIOD: &str = "60s";
+
 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
@@ -485,17 +488,29 @@ impl Reconciler {
                )
                .await
            {
-                Some(Ok(observed)) => observed,
+                Some(Ok(observed)) => Some(observed),
+                Some(Err(mgmt_api::Error::ApiError(status, _msg)))
+                    if status == StatusCode::NOT_FOUND =>
+                {
+                    None
+                }
                Some(Err(e)) => return Err(e.into()),
                None => return Err(ReconcileError::Cancel),
            };
            tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
-            self.observed.locations.insert(
-                attached_node.get_id(),
-                ObservedStateLocation {
-                    conf: observed_conf,
-                },
-            );
+            match observed_conf {
+                Some(conf) => {
+                    // Pageserver returned a state: update it in observed.  This may still be an indeterminate (None) state,
+                    // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
+                    self.observed
+                        .locations
+                        .insert(attached_node.get_id(), ObservedStateLocation { conf });
+                }
+                None => {
+                    // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
+                    self.observed.locations.remove(&attached_node.get_id());
+                }
+            }
        }

        Ok(())
@@ -525,7 +540,12 @@ impl Reconciler {
                )));
            };

-            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
+            let mut wanted_conf = attached_location_conf(
+                generation,
+                &self.shard,
+                &self.config,
+                !self.intent.secondary.is_empty(),
+            );
            match self.observed.locations.get(&node.get_id()) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
@@ -662,10 +682,26 @@ impl Reconciler {
    }
 }

+/// We tweak the externally-set TenantConfig while configuring
+/// locations, using our awareness of whether secondary locations
+/// are in use to automatically enable/disable heatmap uploads.
+fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
+    let mut config = config.clone();
+    if has_secondaries {
+        if config.heatmap_period.is_none() {
+            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
+        }
+    } else {
+        config.heatmap_period = None;
+    }
+    config
+}
+
 pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
+    has_secondaries: bool,
 ) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
@@ -674,7 +710,7 @@ pub(crate) fn attached_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: config.clone(),
+        tenant_conf: ha_aware_config(config, has_secondaries),
    }
 }

@@ -689,6 +725,6 @@ pub(crate) fn secondary_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: config.clone(),
+        tenant_conf: ha_aware_config(config, true),
    }
 }
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -577,7 +577,12 @@ impl TenantState {
                .generation
                .expect("Attempted to enter attached state without a generation");

-            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
+            let wanted_conf = attached_location_conf(
+                generation,
+                &self.shard,
+                &self.config,
+                !self.intent.secondary.is_empty(),
+            );
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
@@ -617,7 +622,7 @@ impl TenantState {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn maybe_reconcile(
        &mut self,
-        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
        service_config: &service::Config,
@@ -729,6 +734,7 @@ impl TenantState {
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
        metrics::RECONCILER.spawned.inc();
+        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
                // Wait for any previous reconcile task to complete before we start
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,11 +8,11 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::controller_api::{
    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
            "start" => rt.block_on(handle_start_all(sub_args, &env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
            "mappings" => handle_mappings(sub_args, &mut env),
@@ -445,14 +445,14 @@ async fn handle_tenant(
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

-            // We must register the tenant with the attachment service, so
+            // We must register the tenant with the storage controller, so
            // that when the pageserver restarts, it will be re-attached.
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                .tenant_create(TenantCreateRequest {
                    // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
-                    // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
-                    // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
+                    // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
+                    // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters {
@@ -476,9 +476,9 @@ async fn handle_tenant(
                .context("Failed to parse postgres version from the argument string")?;

            // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
-            // different shards picking different start lsns.  Maybe we have to teach attachment service
+            // different shards picking different start lsns.  Maybe we have to teach storage controller
            // to let shard 0 branch first and then propagate the chosen LSN to other shards.
-            attachment_service
+            storage_controller
                .tenant_timeline_create(
                    tenant_id,
                    TimelineCreateRequest {
@@ -528,8 +528,8 @@ async fn handle_tenant(
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;

-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                .tenant_migrate(tenant_shard_id, new_pageserver_id)
                .await?;

@@ -543,8 +543,8 @@ async fn handle_tenant(

            let mut tenant_synthetic_size = None;

-            let attachment_service = AttachmentService::from_env(env);
-            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
+            let storage_controller = StorageController::from_env(env);
+            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
                let pageserver =
                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);

@@ -585,10 +585,14 @@ async fn handle_tenant(
        Some(("shard-split", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+            let shard_stripe_size: Option<ShardStripeSize> = matches
+                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
+                .cloned()
+                .unwrap();

-            let attachment_service = AttachmentService::from_env(env);
-            let result = attachment_service
-                .tenant_split(tenant_id, shard_count)
+            let storage_controller = StorageController::from_env(env);
+            let result = storage_controller
+                .tenant_split(tenant_id, shard_count, shard_stripe_size)
                .await?;
            println!(
                "Split tenant {} into shards {}",
@@ -613,7 +617,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
            // where shard 0 is attached, and query there.
            let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
            let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
@@ -633,7 +637,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
            let new_timeline_id_opt = parse_timeline_id(create_match)?;
            let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());

-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
                ancestor_timeline_id: None,
@@ -641,7 +645,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                ancestor_start_lsn: None,
                pg_version: Some(pg_version),
            };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
                .await?;

@@ -730,7 +734,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
            let new_timeline_id = TimelineId::generate();
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
                ancestor_timeline_id: Some(ancestor_timeline_id),
@@ -738,7 +742,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                ancestor_start_lsn: start_lsn,
                pg_version: None,
            };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
                .await?;

@@ -767,7 +771,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

    match sub_name {
        "list" => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
            // where shard 0 is attached, and query there.
            let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
@@ -952,21 +956,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                (
                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by attachment service, therefore not sharded.
+                    // full managed by storage controller, therefore not sharded.
                    ShardParameters::DEFAULT_STRIPE_SIZE,
                )
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
-                let attachment_service = AttachmentService::from_env(env);
-                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+                let storage_controller = StorageController::from_env(env);
+                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
                let pageservers = locate_result
                    .shards
                    .into_iter()
                    .map(|shard| {
                        (
                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Attachment service reported bad hostname"),
+                                .expect("Storage controller reported bad hostname"),
                            shard.listen_pg_port,
                        )
                    })
@@ -1015,8 +1019,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        pageserver.pg_connection_config.port(),
                    )]
                } else {
-                    let attachment_service = AttachmentService::from_env(env);
-                    attachment_service
+                    let storage_controller = StorageController::from_env(env);
+                    storage_controller
                        .tenant_locate(endpoint.tenant_id)
                        .await?
                        .shards
@@ -1024,7 +1028,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        .map(|shard| {
                            (
                                Host::parse(&shard.listen_pg_addr)
-                                    .expect("Attachment service reported malformed host"),
+                                    .expect("Storage controller reported malformed host"),
                                shard.listen_pg_port,
                            )
                        })
@@ -1100,9 +1104,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args), *register)
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1131,7 +1134,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args), false)
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1144,8 +1147,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            let scheduling = subcommand_args.get_one("scheduling");
            let availability = subcommand_args.get_one("availability");

-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                .node_configure(NodeConfigureRequest {
                    node_id: pageserver.conf.id,
                    scheduling: scheduling.cloned(),
@@ -1170,11 +1173,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_attachment_service(
+async fn handle_storage_controller(
    sub_match: &ArgMatches,
    env: &local_env::LocalEnv,
 ) -> Result<()> {
-    let svc = AttachmentService::from_env(env);
+    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
        Some(("start", _start_match)) => {
            if let Err(e) = svc.start().await {
@@ -1194,8 +1197,8 @@ async fn handle_attachment_service(
                exit(1);
            }
        }
-        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
-        None => bail!("no attachment_service subcommand provided"),
+        Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
+        None => bail!("no storage_controller subcommand provided"),
    }
    Ok(())
 }
@@ -1280,11 +1283,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->

    broker::start_broker_process(env).await?;

-    // Only start the attachment service if the pageserver is configured to need it
+    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.start().await {
-            eprintln!("attachment_service start failed: {:#}", e);
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller.start().await {
+            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
        }
@@ -1293,7 +1296,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
        if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match), true)
+            .start(&pageserver_config_overrides(sub_match))
            .await
        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
@@ -1356,9 +1359,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    }

    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate).await {
-            eprintln!("attachment service stop failed: {e:#}");
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller.stop(immediate).await {
+            eprintln!("storage controller stop failed: {e:#}");
        }
    }
 }
@@ -1586,6 +1589,7 @@ fn cli() -> Command {
                .about("Increase the number of shards in the tenant")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
                )
        )
        .subcommand(
@@ -1596,11 +1600,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
-                    .long("register")
-                    .default_value("true").required(false)
-                    .value_parser(value_parser!(bool))
-                    .value_name("register"))
+                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1618,9 +1618,9 @@ fn cli() -> Command {
                )
        )
        .subcommand(
-            Command::new("attachment_service")
+            Command::new("storage_controller")
                .arg_required_else_help(true)
-                .about("Manage attachment_service")
+                .about("Manage storage_controller")
                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

-use crate::attachment_service::AttachmentService;
 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
+use crate::storage_controller::StorageController;

 use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
@@ -750,17 +750,17 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If we weren't given explicit pageservers, query the attachment service
+        // If we weren't given explicit pageservers, query the storage controller
        if pageservers.is_empty() {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
+            let storage_controller = StorageController::from_env(&self.env);
+            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
            pageservers = locate_result
                .shards
                .into_iter()
                .map(|shard| {
                    (
                        Host::parse(&shard.listen_pg_addr)
-                            .expect("Attachment service reported bad hostname"),
+                            .expect("Storage controller reported bad hostname"),
                        shard.listen_pg_port,
                    )
                })
@@ -774,7 +774,10 @@ impl Endpoint {
            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }

-        let client = reqwest::Client::new();
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(30))
+            .build()
+            .unwrap();
        let response = client
            .post(format!(
                "http://{}:{}/configure",
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,7 +6,6 @@
 //! local installations.
 #![deny(clippy::undocumented_unsafe_blocks)]

-pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
@@ -14,3 +13,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod storage_controller;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -72,13 +72,13 @@ pub struct LocalEnv {
    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

-    // Control plane upcall API for pageserver: if None, we will not run attachment_service.  If set, this will
+    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
    #[serde(default)]
    pub control_plane_api: Option<Url>,

-    // Control plane upcall API for attachment service.  If set, this will be propagated into the
-    // attachment service's configuration.
+    // Control plane upcall API for storage controller.  If set, this will be propagated into the
+    // storage controller's configuration.
    #[serde(default)]
    pub control_plane_compute_hook_api: Option<Url>,

@@ -227,10 +227,10 @@ impl LocalEnv {
        self.neon_distrib_dir.join("pageserver")
    }

-    pub fn attachment_service_bin(&self) -> PathBuf {
-        // Irrespective of configuration, attachment service binary is always
+    pub fn storage_controller_bin(&self) -> PathBuf {
+        // Irrespective of configuration, storage controller binary is always
        // run from the same location as neon_local.  This means that for compatibility
-        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        // tests that run old pageserver/safekeeper, they still run latest storage controller.
        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
        neon_local_bin_dir.join("storage_controller")
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,7 +17,6 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
-use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -31,7 +30,6 @@ use utils::{
    lsn::Lsn,
 };

-use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

@@ -111,7 +109,7 @@ impl PageServerNode {
                control_plane_api.as_str()
            ));

-            // Attachment service uses the same auth as pageserver: if JWT is enabled
+            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
@@ -163,8 +161,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false, register).await
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false).await
    }

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -202,6 +200,28 @@ impl PageServerNode {
            String::from_utf8_lossy(&init_output.stderr),
        );

+        // Write metadata file, used by pageserver on startup to register itself with
+        // the storage controller
+        let metadata_path = datadir.join("metadata.json");
+
+        let (_http_host, http_port) =
+            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
+        let http_port = http_port.unwrap_or(9898);
+        // Intentionally hand-craft JSON: this acts as an implicit format compat test
+        // in case the pageserver-side structure is edited, and reflects the real life
+        // situation: the metadata is written by some other script.
+        std::fs::write(
+            metadata_path,
+            serde_json::to_vec(&serde_json::json!({
+                "host": "localhost",
+                "port": self.pg_connection_config.port(),
+                "http_host": "localhost",
+                "http_port": http_port,
+            }))
+            .unwrap(),
+        )
+        .expect("Failed to write metadata file");
+
        Ok(())
    }

@@ -209,27 +229,7 @@ impl PageServerNode {
        &self,
        config_overrides: &[&str],
        update_config: bool,
-        register: bool,
    ) -> anyhow::Result<()> {
-        // Register the node with the storage controller before starting pageserver: pageserver must be registered to
-        // successfully call /re-attach and finish starting up.
-        if register {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let (pg_host, pg_port) =
-                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            attachment_service
-                .node_register(NodeRegisterRequest {
-                    node_id: self.conf.id,
-                    listen_pg_addr: pg_host.to_string(),
-                    listen_pg_port: pg_port.unwrap_or(5432),
-                    listen_http_addr: http_host.to_string(),
-                    listen_http_port: http_port.unwrap_or(80),
-                })
-                .await?;
-        }
-
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -10,7 +10,7 @@ use pageserver_api::{
        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
        TimelineCreateRequest, TimelineInfo,
    },
-    shard::TenantShardId,
+    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -24,7 +24,7 @@ use utils::{
    id::{NodeId, TenantId},
 };

-pub struct AttachmentService {
+pub struct StorageController {
    env: LocalEnv,
    listen: String,
    path: Utf8PathBuf,
@@ -36,7 +36,7 @@ pub struct AttachmentService {

 const COMMAND: &str = "storage_controller";

-const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
@@ -59,7 +59,7 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
 }

-impl AttachmentService {
+impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
            .unwrap()
@@ -136,27 +136,27 @@ impl AttachmentService {
    }

    fn pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
            .expect("non-Unicode path")
    }

-    /// PIDFile for the postgres instance used to store attachment service state
+    /// PIDFile for the postgres instance used to store storage controller state
    fn postgres_pid_file(&self) -> Utf8PathBuf {
        Utf8PathBuf::from_path_buf(
            self.env
                .base_data_dir
-                .join("attachment_service_postgres.pid"),
+                .join("storage_controller_postgres.pid"),
        )
        .expect("non-Unicode path")
    }

    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
    ///
-    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];

        for v in prefer_versions {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
@@ -189,7 +189,7 @@ impl AttachmentService {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "attachment_service";
+        const DB_NAME: &str = "storage_controller";
        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -219,10 +219,10 @@ impl AttachmentService {
    }

    pub async fn start(&self) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the attachment service for persistence.
+        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
            .unwrap()
-            .join("attachment_service_db");
+            .join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let pg_log_path = pg_data_path.join("postgres.log");

@@ -245,7 +245,7 @@ impl AttachmentService {
            .await?;
        };

-        println!("Starting attachment service database...");
+        println!("Starting storage controller database...");
        let db_start_args = [
            "-w",
            "-D",
@@ -256,7 +256,7 @@ impl AttachmentService {
        ];

        background_process::start_process(
-            "attachment_service_db",
+            "storage_controller_db",
            &self.env.base_data_dir,
            pg_bin_dir.join("pg_ctl").as_std_path(),
            db_start_args,
@@ -300,7 +300,7 @@ impl AttachmentService {
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
-            &self.env.attachment_service_bin(),
+            &self.env.storage_controller_bin(),
            args,
            [(
                "NEON_REPO_DIR".to_string(),
@@ -322,10 +322,10 @@ impl AttachmentService {
    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;

-        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;

-        println!("Stopping attachment service database...");
+        println!("Stopping storage controller database...");
        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
            .args(pg_stop_args)
@@ -344,10 +344,10 @@ impl AttachmentService {
            // fine that stop failed.  Otherwise it is an error that stop failed.
            const PG_STATUS_NOT_RUNNING: i32 = 3;
            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Attachment service data base is already stopped");
+                println!("Storage controller database is already stopped");
                return Ok(());
            } else {
-                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
            }
        }

@@ -368,7 +368,7 @@ impl AttachmentService {
        }
    }

-    /// Simple HTTP request wrapper for calling into attachment service
+    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
@@ -496,11 +496,15 @@ impl AttachmentService {
        &self,
        tenant_id: TenantId,
        new_shard_count: u8,
+        new_stripe_size: Option<ShardStripeSize>,
    ) -> anyhow::Result<TenantShardSplitResponse> {
        self.dispatch(
            Method::PUT,
            format!("control/v1/tenant/{tenant_id}/shard_split"),
-            Some(TenantShardSplitRequest { new_shard_count }),
+            Some(TenantShardSplitRequest {
+                new_shard_count,
+                new_stripe_size,
+            }),
        )
        .await
    }
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.

-"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.

-"admin": Provides access to the control plane and admin APIs of the attachment service.
+"admin": Provides access to the control plane and admin APIs of the storage controller.

 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -88,8 +88,6 @@ impl FromStr for NodeAvailability {
    }
 }

-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
    Active,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -198,6 +198,13 @@ pub struct TimelineCreateRequest {
 #[derive(Serialize, Deserialize)]
 pub struct TenantShardSplitRequest {
    pub new_shard_count: u8,
+
+    // A tenant's stripe size is only meaningful the first time their shard count goes
+    // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
+    //
+    // If this is set while the stripe count is being increased from an already >1 value,
+    // then the request will fail with 400.
+    pub new_stripe_size: Option<ShardStripeSize>,
 }

 #[derive(Serialize, Deserialize)]
@@ -419,7 +426,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantShardId,
+    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,11 +6,18 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::shard::TenantShardId;
+use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};

+/// Upcall message sent by the pageserver to the configured `control_plane_api` on
+/// startup.
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
    pub node_id: NodeId,
+
+    /// Optional inline self-registration: this is useful with the storage controller,
+    /// if the node already has a node_id set.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub register: Option<NodeRegisterRequest>,
 }

 #[derive(Serialize, Deserialize)]
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -123,6 +123,12 @@ impl PageserverFeedback {
                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
                    }
                }
+                b"shard_number" => {
+                    let len = buf.get_i32();
+                    // TODO: this will be implemented in the next update,
+                    //  for now, we just skip the value.
+                    buf.advance(len as usize);
+                }
                _ => {
                    let len = buf.get_i32();
                    warn!(
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    }
 }

-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+        (*api).process_safekeeper_feedback(&mut (*wp))
    }
 }

--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
        todo!()
    }

-    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
        todo!()
    }

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -257,7 +257,7 @@ impl Client {
        lazy: bool,
    ) -> Result<()> {
        let req_body = TenantLocationConfigRequest {
-            tenant_id: tenant_shard_id,
+            tenant_id: Some(tenant_shard_id),
            config,
        };

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,3 +1,5 @@
+#![recursion_limit = "300"]
+
 //! Main entry point for the Page Server executable.

 use std::env::{var, VarError};
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,8 +7,9 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde;
 use serde::de::IntoDeserializer;
-use std::env;
+use std::{collections::HashMap, env};
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
@@ -83,6 +84,10 @@ pub mod defaults {

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

+    #[cfg(target_os = "linux")]
+    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
+
+    #[cfg(not(target_os = "linux"))]
    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
@@ -300,6 +305,26 @@ impl<T> BuilderValue<T> {
    }
 }

+// Certain metadata (e.g. externally-addressable name, AZ) is delivered
+// as a separate structure.  This information is not neeed by the pageserver
+// itself, it is only used for registering the pageserver with the control
+// plane and/or storage controller.
+//
+#[derive(serde::Deserialize)]
+pub(crate) struct NodeMetadata {
+    #[serde(rename = "host")]
+    pub(crate) postgres_host: String,
+    #[serde(rename = "port")]
+    pub(crate) postgres_port: u16,
+    pub(crate) http_host: String,
+    pub(crate) http_port: u16,
+
+    // Deployment tools may write fields to the metadata file beyond what we
+    // use in this type: this type intentionally only names fields that require.
+    #[serde(flatten)]
+    pub(crate) other: HashMap<String, serde_json::Value>,
+}
+
 // needed to simplify config construction
 struct PageServerConfigBuilder {
    listen_pg_addr: BuilderValue<String>,
@@ -757,6 +782,10 @@ impl PageServerConf {
        self.workdir.join("deletion")
    }

+    pub fn metadata_path(&self) -> Utf8PathBuf {
+        self.workdir.join("metadata.json")
+    }
+
    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;

 use futures::Future;
 use pageserver_api::{
+    controller_api::NodeRegisterRequest,
    shard::TenantShardId,
    upcall_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -12,7 +13,10 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{backoff, generation::Generation, id::NodeId};

-use crate::config::PageServerConf;
+use crate::{
+    config::{NodeMetadata, PageServerConf},
+    virtual_file::on_fatal_io_error,
+};

 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -32,6 +36,7 @@ pub enum RetryForeverError {
 pub trait ControlPlaneGenerationsApi {
    fn re_attach(
        &self,
+        conf: &PageServerConf,
    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
    fn validate(
        &self,
@@ -110,13 +115,59 @@ impl ControlPlaneClient {

 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    async fn re_attach(
+        &self,
+        conf: &PageServerConf,
+    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
            .expect("Failed to build re-attach path");
+
+        // Include registration content in the re-attach request if a metadata file is readable
+        let metadata_path = conf.metadata_path();
+        let register = match tokio::fs::read_to_string(&metadata_path).await {
+            Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
+                Ok(m) => {
+                    // Since we run one time at startup, be generous in our logging and
+                    // dump all metadata.
+                    tracing::info!(
+                        "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
+                        m.postgres_host,
+                        m.postgres_port,
+                        m.http_host,
+                        m.http_port,
+                        m.other
+                    );
+
+                    Some(NodeRegisterRequest {
+                        node_id: conf.id,
+                        listen_pg_addr: m.postgres_host,
+                        listen_pg_port: m.postgres_port,
+                        listen_http_addr: m.http_host,
+                        listen_http_port: m.http_port,
+                    })
+                }
+                Err(e) => {
+                    tracing::error!("Unreadable metadata in {metadata_path}: {e}");
+                    None
+                }
+            },
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    // This is legal: we may have been deployed with some external script
+                    // doing registration for us.
+                    tracing::info!("Metadata file not found at {metadata_path}");
+                } else {
+                    on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
+                }
+                None
+            }
+        };
+
        let request = ReAttachRequest {
            node_id: self.node_id,
+            register,
        };

        fail::fail_point!("control-plane-client-re-attach");
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -831,7 +831,10 @@ mod test {
    }

    impl ControlPlaneGenerationsApi for MockControlPlane {
-        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+        async fn re_attach(
+            &self,
+            _conf: &PageServerConf,
+        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
        async fn validate(
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,9 +567,9 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
-  /v1/tenant/{tenant_id}/location_config:
+  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
-      - name: tenant_id
+      - name: tenant_shard_id
        in: path
        required: true
        schema:
@@ -932,6 +932,59 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"

+  /v1/tenant/{tenant_shard_id}/heatmap_upload:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        If the location is in an attached mode, upload the current state to the remote heatmap
+      responses:
+        "200":
+          description: Success
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+  /v1/tenant/{tenant_shard_id}/secondary/download:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        If the location is in secondary mode, download latest heatmap and layers
+      responses:
+        "200":
+          description: Success
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -1314,10 +1367,11 @@ components:
    TenantLocationConfigRequest:
      type: object
      required:
-        - tenant_id
+        - mode
      properties:
        tenant_id:
          type: string
+          description: Not used, scheduled for removal.
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1391,7 +1445,7 @@ components:
        trace_read_requests:
          type: boolean
        heatmap_period:
-          type: integer
+          type: string
    TenantConfigResponse:
      type: object
      properties:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1151,7 +1151,12 @@ async fn tenant_shard_split_handler(

    let new_shards = state
        .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
+        .shard_split(
+            tenant_shard_id,
+            ShardCount::new(req.new_shard_count),
+            req.new_stripe_size,
+            &ctx,
+        )
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -2103,6 +2108,16 @@ where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
 {
+    if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
+        fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
+            "failpoint".into()
+        )));
+
+        fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
+            anyhow::anyhow!("failpoint")
+        )));
+    }
+
    // Spawn a new task to handle the request, to protect the handler from unexpected
    // async cancellations. Most pageserver functions are not async cancellation safe.
    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
@@ -2247,7 +2262,7 @@ pub fn make_router(
        .get("/v1/location_config", |r| {
            api_handler(r, list_location_config_handler)
        })
-        .get("/v1/location_config/:tenant_id", |r| {
+        .get("/v1/location_config/:tenant_shard_id", |r| {
            api_handler(r, get_location_config_handler)
        })
        .put(
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -167,7 +167,7 @@ impl GetVectoredLatency {
 pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_get_vectored_seconds",
-        "Time spent in get_vectored",
+        "Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
        &["task_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
@@ -2017,10 +2017,8 @@ impl TimelineMetrics {
    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
-}

-impl Drop for TimelineMetrics {
-    fn drop(&mut self) {
+    pub(crate) fn shutdown(&self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
@@ -2676,6 +2674,12 @@ pub fn preinitialize_metrics() {
    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
    Lazy::force(&disk_usage_based_eviction::METRICS);

+    for state_name in pageserver_api::models::TenantState::VARIANTS {
+        // initialize the metric for all gauges, otherwise the time series might seemingly show
+        // values from last restart.
+        TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
+    }
+
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1846,6 +1846,8 @@ impl Tenant {
        // Wait for any in-flight operations to complete
        self.gate.close().await;

+        remove_tenant_metrics(&self.tenant_shard_id);
+
        Ok(())
    }

@@ -3557,11 +3559,6 @@ async fn run_initdb(
    Ok(())
 }

-impl Drop for Tenant {
-    fn drop(&mut self) {
-        remove_tenant_metrics(&self.tenant_shard_id);
-    }
-}
 /// Dump contents of a layer file to stdout.
 pub async fn dump_layerfile_from_path(
    path: &Utf8Path,
@@ -4628,10 +4625,7 @@ mod tests {
        drop(guard);

        // Pick a big LSN such that we query over all the changes.
-        // Technically, u64::MAX - 1 is the largest LSN supported by the read path,
-        // but there seems to be a bug on the non-vectored search path which surfaces
-        // in that case.
-        let reads_lsn = Lsn(u64::MAX - 1000);
+        let reads_lsn = Lsn(u64::MAX - 1);

        for read in reads {
            info!("Doing vectored read on {:?}", read);
@@ -5148,4 +5142,23 @@ mod tests {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_read_at_max_lsn")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
+
+        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let read_lsn = Lsn(u64::MAX - 1);
+
+        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -354,6 +354,7 @@ pub struct TenantConf {
    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
    /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
    pub heatmap_period: Duration,

    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -6,7 +6,9 @@ use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
-use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::shard::{
+    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -295,7 +297,7 @@ async fn init_load_generations(
    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
        info!("Calling control plane API to re-attach tenants");
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-        match client.re_attach().await {
+        match client.re_attach(conf).await {
            Ok(tenants) => tenants,
            Err(RetryForeverError::ShuttingDown) => {
                anyhow::bail!("Shut down while waiting for control plane re-attach response")
@@ -1439,11 +1441,41 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        new_shard_count: ShardCount,
+        new_stripe_size: Option<ShardStripeSize>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let r = self
+            .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
+            .await;
+        if r.is_err() {
+            // Shard splitting might have left the original shard in a partially shut down state (it
+            // stops the shard's remote timeline client).  Reset it to ensure we leave things in
+            // a working state.
+            if self.get(tenant_shard_id).is_some() {
+                tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
+                if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
+                    // Log this error because our return value will still be the original error, not this one.  This is
+                    // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
+                    // (e.g. has uploads disabled).  We can't do anything else: if reset fails then shutting the tenant down or
+                    // setting it broken probably won't help either.
+                    tracing::error!("Failed to reset {tenant_shard_id}: {e}");
+                }
+            }
+        }
+
+        r
+    }
+
+    pub(crate) async fn do_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_shard_count: ShardCount,
+        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<TenantShardId>> {
        let tenant = get_tenant(tenant_shard_id, true)?;

-        // Plan: identify what the new child shards will be
+        // Validate the incoming request
        if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
            anyhow::bail!("Requested shard count is not an increase");
        }
@@ -1452,10 +1484,18 @@ impl TenantManager {
            anyhow::bail!("Requested split is not a power of two");
        }

-        let parent_shard_identity = tenant.shard_identity;
-        let parent_tenant_conf = tenant.get_tenant_conf();
-        let parent_generation = tenant.generation;
+        if let Some(new_stripe_size) = new_stripe_size {
+            if tenant.get_shard_stripe_size() != new_stripe_size
+                && tenant_shard_id.shard_count.count() > 1
+            {
+                // This tenant already has multiple shards, it is illegal to try and change its stripe size
+                anyhow::bail!(
+                    "Shard stripe size may not be modified once tenant has multiple shards"
+                );
+            }
+        }

+        // Plan: identify what the new child shards will be
        let child_shards = tenant_shard_id.split(new_shard_count);
        tracing::info!(
            "Shard {} splits into: {}",
@@ -1466,6 +1506,14 @@ impl TenantManager {
                .join(",")
        );

+        fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
+        let parent_shard_identity = tenant.shard_identity;
+        let parent_tenant_conf = tenant.get_tenant_conf();
+        let parent_generation = tenant.generation;
+
        // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
        if let Err(e) = tenant.split_prepare(&child_shards).await {
            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
@@ -1475,6 +1523,10 @@ impl TenantManager {
            return Err(e);
        }

+        fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
        self.resources.deletion_queue_client.flush_advisory();

        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
@@ -1496,11 +1548,16 @@ impl TenantManager {
                anyhow::bail!("Detached parent shard in the middle of split!")
            }
        };
-
+        fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
        // Optimization: hardlink layers from the parent into the children, so that they don't have to
        // re-download & duplicate the data referenced in their initial IndexPart
        self.shard_split_hardlink(parent, child_shards.clone())
            .await?;
+        fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));

        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
        // child shards to reach this point.
@@ -1515,6 +1572,9 @@ impl TenantManager {
        // Phase 3: Spawn the child shards
        for child_shard in &child_shards {
            let mut child_shard_identity = parent_shard_identity;
+            if let Some(new_stripe_size) = new_stripe_size {
+                child_shard_identity.stripe_size = new_stripe_size;
+            }
            child_shard_identity.count = child_shard.shard_count;
            child_shard_identity.number = child_shard.shard_number;

@@ -1537,6 +1597,10 @@ impl TenantManager {
            .await?;
        }

+        fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
        for child_shard_id in &child_shards {
            let child_shard_id = *child_shard_id;
@@ -1569,6 +1633,10 @@ impl TenantManager {
                        timeline.timeline_id,
                        target_lsn
                    );
+
+                    fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
+                        "failpoint"
+                    )));
                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
                        // Failure here might mean shutdown, in any case this part is an optimization
                        // and we shouldn't hold up the split operation.
@@ -1614,6 +1682,10 @@ impl TenantManager {
            },
        );

+        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
        parent_slot_guard.drop_old_value()?;

        // Phase 6: Release the InProgress on the parent shard
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -536,6 +536,18 @@ impl Drop for LayerInner {
            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;

+            let Some(timeline) = timeline.upgrade() else {
+                // no need to nag that timeline is gone: under normal situation on
+                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
+            let Ok(_guard) = timeline.gate.enter() else {
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
            let removed = match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
@@ -554,32 +566,26 @@ impl Drop for LayerInner {
                }
            };

-            if let Some(timeline) = timeline.upgrade() {
-                if removed {
-                    timeline.metrics.resident_physical_size_sub(file_size);
-                }
-                if let Some(remote_client) = timeline.remote_client.as_ref() {
-                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+            if removed {
+                timeline.metrics.resident_physical_size_sub(file_size);
+            }
+            if let Some(remote_client) = timeline.remote_client.as_ref() {
+                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);

-                    if let Err(e) = res {
-                        // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                        // demonstrating this deadlock (without spawn_blocking): stop will drop
-                        // queued items, which will have ResidentLayer's, and those drops would try
-                        // to re-entrantly lock the RemoteTimelineClient inner state.
-                        if !timeline.is_active() {
-                            tracing::info!("scheduling deletion on drop failed: {e:#}");
-                        } else {
-                            tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                        }
-                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+                if let Err(e) = res {
+                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                    // demonstrating this deadlock (without spawn_blocking): stop will drop
+                    // queued items, which will have ResidentLayer's, and those drops would try
+                    // to re-entrantly lock the RemoteTimelineClient inner state.
+                    if !timeline.is_active() {
+                        tracing::info!("scheduling deletion on drop failed: {e:#}");
                    } else {
-                        LAYER_IMPL_METRICS.inc_completed_deletes();
+                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
                    }
+                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+                } else {
+                    LAYER_IMPL_METRICS.inc_completed_deletes();
                }
-            } else {
-                // no need to nag that timeline is gone: under normal situation on
-                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
            }
        });
    }
@@ -1095,6 +1101,10 @@ impl LayerInner {
            return Err(EvictionCancelled::TimelineGone);
        };

+        let Ok(_gate) = timeline.gate.enter() else {
+            return Err(EvictionCancelled::TimelineGone);
+        };
+
        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -130,10 +130,10 @@ where
        self.inner.load().config.steady_rps()
    }

-    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
+    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
        let inner = self.inner.load_full(); // clones the `Inner` Arc
        if !inner.task_kinds.contains(ctx.task_kind()) {
-            return;
+            return None;
        };
        let start = std::time::Instant::now();
        let mut did_throttle = false;
@@ -170,6 +170,9 @@ where
                    });
                }
            }
+            Some(wait_time)
+        } else {
+            None
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -634,6 +634,8 @@ impl Timeline {
    /// If a remote layer file is needed, it is downloaded as part of this
    /// call.
    ///
+    /// This method enforces [`Self::timeline_get_throttle`] internally.
+    ///
    /// NOTE: It is considered an error to 'get' a key that doesn't exist. The
    /// abstraction above this needs to store suitable metadata to track what
    /// data exists with what keys, in separate metadata entries. If a
@@ -644,18 +646,27 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
+    #[inline(always)]
    pub(crate) async fn get(
        &self,
        key: Key,
        lsn: Lsn,
        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.timeline_get_throttle.throttle(ctx, 1).await;
+        self.get_impl(key, lsn, ctx).await
+    }
+    /// Not subject to [`Self::timeline_get_throttle`].
+    async fn get_impl(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if !lsn.is_valid() {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

-        self.timeline_get_throttle.throttle(ctx, 1).await;
-
        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
        // already checked the key against the shard_identity when looking up the Timeline from
        // page_service.
@@ -752,10 +763,6 @@ impl Timeline {
            return Err(GetVectoredError::Oversized(key_count));
        }

-        self.timeline_get_throttle
-            .throttle(ctx, key_count as usize)
-            .await;
-
        for range in &keyspace.ranges {
            let mut key = range.start;
            while key != range.end {
@@ -772,11 +779,18 @@ impl Timeline {
            self.conf.get_vectored_impl
        );

-        let _timer = crate::metrics::GET_VECTORED_LATENCY
+        let start = crate::metrics::GET_VECTORED_LATENCY
            .for_task_kind(ctx.task_kind())
-            .map(|t| t.start_timer());
+            .map(|metric| (metric, Instant::now()));

-        match self.conf.get_vectored_impl {
+        // start counting after throttle so that throttle time
+        // is always less than observation time
+        let throttled = self
+            .timeline_get_throttle
+            .throttle(ctx, key_count as usize)
+            .await;
+
+        let res = match self.conf.get_vectored_impl {
            GetVectoredImpl::Sequential => {
                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
            }
@@ -790,9 +804,33 @@ impl Timeline {

                vectored_res
            }
+        };
+
+        if let Some((metric, start)) = start {
+            let elapsed = start.elapsed();
+            let ex_throttled = if let Some(throttled) = throttled {
+                elapsed.checked_sub(throttled)
+            } else {
+                Some(elapsed)
+            };
+
+            if let Some(ex_throttled) = ex_throttled {
+                metric.observe(ex_throttled.as_secs_f64());
+            } else {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!("error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+            }
        }
+
+        res
    }

+    /// Not subject to [`Self::timeline_get_throttle`].
    pub(super) async fn get_vectored_sequential_impl(
        &self,
        keyspace: KeySpace,
@@ -803,7 +841,7 @@ impl Timeline {
        for range in keyspace.ranges {
            let mut key = range.start;
            while key != range.end {
-                let block = self.get(key, lsn, ctx).await;
+                let block = self.get_impl(key, lsn, ctx).await;

                use PageReconstructError::*;
                match block {
@@ -853,6 +891,7 @@ impl Timeline {
        Ok(results)
    }

+    /// Not subject to [`Self::timeline_get_throttle`].
    pub(super) async fn validate_get_vectored_impl(
        &self,
        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
@@ -1257,6 +1296,8 @@ impl Timeline {

        // Finally wait until any gate-holders are complete
        self.gate.close().await;
+
+        self.metrics.shutdown();
    }

    pub(crate) fn set_state(&self, new_state: TimelineState) {
@@ -2476,7 +2517,7 @@ impl Timeline {
        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
        // to check that each iteration make some progress, to break infinite
        // looping if something goes wrong.
-        let mut prev_lsn = Lsn(u64::MAX);
+        let mut prev_lsn = None;

        let mut result = ValueReconstructResult::Continue;
        let mut cont_lsn = Lsn(request_lsn.0 + 1);
@@ -2496,18 +2537,20 @@ impl Timeline {
                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
                        return Ok(traversal_path);
                    }
-                    if prev_lsn <= cont_lsn {
-                        // Didn't make any progress in last iteration. Error out to avoid
-                        // getting stuck in the loop.
-                        return Err(layer_traversal_error(format!(
-                            "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
-                            key,
-                            Lsn(cont_lsn.0 - 1),
-                            request_lsn,
-                            timeline.ancestor_lsn
-                        ), traversal_path));
+                    if let Some(prev) = prev_lsn {
+                        if prev <= cont_lsn {
+                            // Didn't make any progress in last iteration. Error out to avoid
+                            // getting stuck in the loop.
+                            return Err(layer_traversal_error(format!(
+                                "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                                key,
+                                Lsn(cont_lsn.0 - 1),
+                                request_lsn,
+                                timeline.ancestor_lsn
+                            ), traversal_path));
+                        }
                    }
-                    prev_lsn = cont_lsn;
+                    prev_lsn = Some(cont_lsn);
                }
                ValueReconstructResult::Missing => {
                    return Err(layer_traversal_error(
@@ -2537,7 +2580,7 @@ impl Timeline {

                timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
                timeline = &*timeline_owned;
-                prev_lsn = Lsn(u64::MAX);
+                prev_lsn = None;
                continue 'outer;
            }

@@ -2963,7 +3006,6 @@ impl Timeline {
            }

            trace!("waking up");
-            let timer = self.metrics.flush_time_histo.start_timer();
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
                if self.cancel.is_cancelled() {
@@ -2974,6 +3016,8 @@ impl Timeline {
                    return;
                }

+                let timer = self.metrics.flush_time_histo.start_timer();
+
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    guard.layer_map().frozen_layers.front().cloned()
@@ -2995,13 +3039,12 @@ impl Timeline {
                        break err;
                    }
                }
+                timer.stop_and_record();
            };
            // Notify any listeners that we're done
            let _ = self
                .layer_flush_done_tx
                .send_replace((flush_counter, result));
-
-            timer.stop_and_record();
        }
    }

@@ -3069,6 +3112,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<(), FlushLayerError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
+
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
 	struct sysinfo si;
 	Size total;
 	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %n");
+		elog(ERROR, "Failed to get amount of RAM: %m");

 	total = si.totalram*si.mem_unit;
 	if ((Size)NBuffers*BLCKSZ + requested >= total)
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library)

 	CURLcode	res;
 	char	   *compute_ctl_url;
-	char	   *postdata;
 	bool		ret = false;

 	if (handle == NULL)
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
 	uint64_t	us_since_last_connect;
+	bool	broke_from_loop = false;

 	Assert(page_servers[shard_no].conn == NULL);

@@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel)

 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					return false;
+					/* Returning from inside PG_TRY is bad, so we break/return later */
+					broke_from_loop = true;
+					break;
 				}
 			}
 		}
@@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	}
 	PG_END_TRY();

+	if (broke_from_loop)
+	{
+		return false;
+	}
+
 	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
 	page_servers[shard_no].conn = conn;
 	page_servers[shard_no].wes = wes;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void)
 	DIR		   *dirdesc;
 	struct dirent *de;
 	char	   *snap_path = "pg_logical/snapshots/";
-	int			cnt = 0;
 	int			lsns_allocated = 1024;
 	int			lsns_num = 0;
 	XLogRecPtr *lsns;
@@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void)
 PGDLLEXPORT void
 LogicalSlotsMonitorMain(Datum main_arg)
 {
-	TimestampTz now,
-				last_checked;
-
 	/* Establish signal handlers. */
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 				int nblocks, bool skipFsync)
 {
 	const PGAlignedBlock buffer = {0};
-	BlockNumber curblocknum = blocknum;
 	int			remblocks = nblocks;
 	XLogRecPtr	lsn = 0;

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin
 	req->epochStartLsn = wp->propEpochStartLsn;
 	req->beginLsn = beginLsn;
 	req->endLsn = endLsn;
-	req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp);
+	req->commitLsn = wp->commitLsn;
 	req->truncateLsn = wp->truncateLsn;
 	req->proposerId = wp->greetRequest.proposerId;
 }
@@ -1405,7 +1405,7 @@ static bool
 RecvAppendResponses(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
-	XLogRecPtr	minQuorumLsn;
+	XLogRecPtr	newCommitLsn;
 	bool		readAnything = false;

 	while (true)
@@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk)
 	if (!readAnything)
 		return sk->state == SS_ACTIVE;

-	HandleSafekeeperResponse(wp);
-
+	/* update commit_lsn */
+	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
 	/*
-	 * Also send the new commit lsn to all the safekeepers.
+	 * Send the new value to all safekeepers.
 	 */
-	minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	if (minQuorumLsn > wp->lastSentCommitLsn)
+	if (newCommitLsn > wp->commitLsn)
 	{
+		wp->commitLsn = newCommitLsn;
 		BroadcastAppendRequest(wp);
-		wp->lastSentCommitLsn = minQuorumLsn;
 	}

+	HandleSafekeeperResponse(wp);
+
 	return sk->state == SS_ACTIVE;
 }

@@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 static void
 HandleSafekeeperResponse(WalProposer *wp)
 {
-	XLogRecPtr	minQuorumLsn;
 	XLogRecPtr	candidateTruncateLsn;

-	minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
+	wp->api.process_safekeeper_feedback(wp);

 	/*
 	 * Try to advance truncateLsn -- the last record flushed to all
@@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp)
 	 * can't commit entries from previous term' in Raft); 2)
 	 */
 	candidateTruncateLsn = CalculateMinFlushLsn(wp);
-	candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
+	candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn);
 	if (candidateTruncateLsn > wp->truncateLsn)
 	{
 		wp->truncateLsn = candidateTruncateLsn;
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -564,7 +564,7 @@ typedef struct walproposer_api
 	 * backpressure feedback and to confirm WAL persistence (has been commited
 	 * on the quorum of safekeepers).
 	 */
-	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
+	void		(*process_safekeeper_feedback) (WalProposer *wp);

 	/*
 	 * Write a log message to the internal log processor. This is used only
@@ -646,8 +646,8 @@ typedef struct WalProposer
 	/* WAL has been generated up to this point */
 	XLogRecPtr	availableLsn;

-	/* last commitLsn broadcasted to safekeepers */
-	XLogRecPtr	lastSentCommitLsn;
+	/* cached GetAcknowledgedByQuorumWALPosition result */
+	XLogRecPtr	commitLsn;

 	ProposerGreeting greetRequest;

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared;
 static WalProposerConfig walprop_config;
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;
 static const walproposer_api walprop_pg;
+static volatile sig_atomic_t got_SIGUSR2 = false;
+static bool reported_sigusr2 = false;

 static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
@@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
 static void update_nwr_event_set(Safekeeper *sk, uint32 events);
 static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);

+static void CheckGracefulShutdown(WalProposer *wp);
+
 static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);

 static void
@@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 	BackgroundWorkerUnblockSignals();
 }

+/*
+ * We pretend to be a walsender process, and the lifecycle of a walsender is
+ * slightly different than other procesess. At shutdown, walsender processes
+ * stay alive until the very end, after the checkpointer has written the
+ * shutdown checkpoint. When the checkpointer exits, the postmaster sends all
+ * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
+ * the remaining WAL, and then exit. This ensures that the checkpoint record
+ * reaches durable storage (in safekeepers), before the server shuts down
+ * completely.
+ */
+static void
+walprop_sigusr2(SIGNAL_ARGS)
+{
+	got_SIGUSR2 = true;
+
+	SetLatch(MyLatch);
+}
+
 static void
 walprop_pg_init_bgworker(void)
 {
@@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void)
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
 	pqsignal(SIGTERM, die);
+	pqsignal(SIGUSR2, walprop_sigusr2);

 	BackgroundWorkerUnblockSignals();

@@ -1026,7 +1049,7 @@ static void
 StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 {
 	XLogRecPtr	FlushPtr;
-	TimeLineID	currTLI;
+	 __attribute__((unused)) TimeLineID	currTLI;

 #if PG_VERSION_NUM < 150000
 	if (ThisTimeLineID == 0)
@@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 #endif

 	/*
-	 * When we first start replication the standby will be behind the primary.
-	 * For some applications, for example synchronous replication, it is
-	 * important to have a clear state for this initial catchup mode, so we
-	 * can trigger actions when we change streaming state later. We may stay
-	 * in this state for a long time, which is exactly why we want to be able
-	 * to monitor whether or not we are still here.
+	 * XXX: Move straight to STOPPING state, skipping the STREAMING state.
+	 *
+	 * This is a bit weird. Normal walsenders stay in STREAMING state, until
+	 * the checkpointer signals them that it is about to start writing the
+	 * shutdown checkpoint. The walsenders acknowledge that they have received
+	 * that signal by switching to STOPPING state. That tells the walsenders
+	 * that they must not write any new WAL.
+	 *
+	 * However, we cannot easily intercept that signal from the checkpointer.
+	 * It's sent by WalSndInitStopping(), using
+	 * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by
+	 * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag.
+	 * However, that's all private to walsender.c.
+	 *
+	 * We don't need to do anything special upon receiving the signal, the
+	 * walproposer doesn't write any WAL anyway, so we skip the STREAMING
+	 * state and go directly to STOPPING mode. That way, the checkpointer
+	 * won't wait for us.
 	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);
+	WalSndSetState(WALSNDSTATE_STOPPING);

 	/*
 	 * Don't allow a request to stream from a future point in WAL that hasn't
@@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 static void
 WalSndLoop(WalProposer *wp)
 {
+	XLogRecPtr	flushPtr;
+
 	/* Clear any already-pending wakeups */
 	ResetLatch(MyLatch);

@@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp)
 		CHECK_FOR_INTERRUPTS();

 		XLogBroadcastWalProposer(wp);
-
-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
 		WalProposerPoll(wp);
 	}
 }
@@ -1230,7 +1264,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 	TimeLineID	timeline;
 	XLogRecPtr	startpos;
 	XLogRecPtr	endpos;
-	uint64		download_range_mb;

 	startpos = GetLogRepRestartLSN(wp);
 	if (startpos == InvalidXLogRecPtr)
@@ -1745,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	{
 		ConditionVariableCancelSleep();
 		ResetLatch(MyLatch);
+
+		CheckGracefulShutdown(wp);
+
 		*events = WL_LATCH_SET;
 		return 1;
 	}
@@ -1798,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 	exit(0);
 }

+/*
+ * Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
+ *
+ * Note that unlike sync-safekeepers waiting here is not reliable: we
+ * don't check that majority of safekeepers received and persisted
+ * commit_lsn -- only that walproposer reached it (which immediately
+ * broadcasts new value). Doing that without incurring redundant control
+ * file syncing would need wp -> sk protocol change. OTOH unlike
+ * sync-safekeepers which must bump commit_lsn or basebackup will fail,
+ * this catchup is important only for tests where safekeepers/network
+ * don't crash on their own.
+ */
+static void
+CheckGracefulShutdown(WalProposer *wp)
+{
+	if (got_SIGUSR2)
+	{
+		if (!reported_sigusr2)
+		{
+			XLogRecPtr	flushPtr = walprop_pg_get_flush_rec_ptr(wp);
+
+			wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
+					LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr));
+			reported_sigusr2 = true;
+		}
+
+		if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp))
+		{
+			wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
+					LSN_FORMAT_ARGS(wp->commitLsn));
+			proc_exit(0);
+		}
+	}
+}
+
 /*
 * Choose most advanced PageserverFeedback and set it to *rf.
 */
@@ -1878,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 * None of that is functional in sync-safekeepers.
 */
 static void
-walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
+walprop_pg_process_safekeeper_feedback(WalProposer *wp)
 {
 	HotStandbyFeedback hsFeedback;
 	XLogRecPtr	oldDiskConsistentLsn;
@@ -1893,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 	replication_feedback_set(&quorumFeedback.rf);
 	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);

-	if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
 	{
-		if (commitLsn > quorumFeedback.flushLsn)
-			quorumFeedback.flushLsn = commitLsn;
+		if (wp->commitLsn > quorumFeedback.flushLsn)
+			quorumFeedback.flushLsn = wp->commitLsn;

 		/*
 		 * Advance the replication slot to commitLsn. WAL before it is
@@ -1929,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
 								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}
+
+	CheckGracefulShutdown(wp);
 }

 static XLogRecPtr
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS)
 Datum
 test_release_memory(PG_FUNCTION_ARGS)
 {
-	TimestampTz start;
-
 	if (PG_ARGISNULL(0))
 	{
 		if (consume_cxt)
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -220,6 +220,9 @@ enter_seccomp_mode(void)
 }
 #endif /* HAVE_LIBSECCOMP */

+PGDLLEXPORT void
+WalRedoMain(int argc, char *argv[]);
+
 /*
 * Entry point for the WAL redo process.
 *
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -25,13 +25,16 @@ pub async fn authenticate_cleartext(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer.pause();
+    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

-    let auth_outcome = AuthFlow::new(client)
+    let auth_flow = AuthFlow::new(client)
        .begin(auth::CleartextPassword(secret))
-        .await?
-        .authenticate()
        .await?;
+    drop(paused);
+    // cleartext auth is only allowed to the ws/http protocol.
+    // If we're here, we already received the password in the first message.
+    // Scram protocol will be executed on the proxy side.
+    let auth_outcome = auth_flow.authenticate().await?;

    let keys = match auth_outcome {
        sasl::Outcome::Success(key) => key,
@@ -56,7 +59,7 @@ pub async fn password_hack_no_authentication(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer.pause();
+    let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -143,7 +143,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        let Scram(secret, ctx) = self.state;

        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer.pause();
+        let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

        // Initial client message contains the chosen auth method's name.
        let msg = self.stream.read_password_message().await?;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -73,7 +73,7 @@ pub mod errors {
                        // Status 406: endpoint is disabled (we don't allow connections).
                        format!("{REQUEST_FAILED}: endpoint is disabled")
                    }
-                    http::StatusCode::LOCKED => {
+                    http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
                        // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
                        format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
                    }
@@ -91,6 +91,12 @@ pub mod errors {
                    status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
                    ..
                } => crate::error::ErrorKind::User,
+                ApiError::Console {
+                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
+                    text,
+                } if text.contains("compute time quota of non-primary branches is exceeded") => {
+                    crate::error::ErrorKind::User
+                }
                ApiError::Console {
                    status: http::StatusCode::LOCKED,
                    text,
@@ -120,6 +126,11 @@ pub mod errors {
                    status: http::StatusCode::BAD_REQUEST,
                    ..
                } => true,
+                // don't retry when quotas are exceeded
+                Self::Console {
+                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
+                    ref text,
+                } => !text.contains("compute time quota of non-primary branches is exceeded"),
                // locked can be returned when the endpoint was in transition
                // or when quotas are exceeded. don't retry when quotas are exceeded
                Self::Console {
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -6,7 +6,9 @@ use super::{
    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
    NodeInfo,
 };
-use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
+use crate::{
+    auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
+};
 use crate::{
    cache::Cached,
    context::RequestMonitoring,
@@ -72,7 +74,9 @@ impl Api {

            info!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();
+            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
            let response = self.endpoint.execute(request).await?;
+            drop(pause);
            info!(duration = ?start.elapsed(), "received http response");
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
@@ -132,7 +136,9 @@ impl Api {

            info!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();
+            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
            let response = self.endpoint.execute(request).await?;
+            drop(pause);
            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response).await?;

@@ -254,6 +260,7 @@ impl super::Api for Api {
        if permit.should_check_cache() {
            if let Some(cached) = self.caches.node_info.get(&key) {
                info!(key = &*key, "found cached compute node info");
+                ctx.set_cold_start_info(ColdStartInfo::Warm);
                return Ok(cached);
            }
        }
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -15,11 +15,12 @@ use crate::{
    BranchId, DbName, EndpointId, ProjectId, RoleName,
 };

+use self::parquet::RequestData;
+
 pub mod parquet;

-static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
+static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();

-#[derive(Clone)]
 /// Context data for a single request to connect to a database.
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
@@ -46,7 +47,7 @@ pub struct RequestMonitoring {

    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
-    sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
+    sender: Option<mpsc::UnboundedSender<RequestData>>,
    pub latency_timer: LatencyTimer,
 }

@@ -111,6 +112,10 @@ impl RequestMonitoring {
        )
    }

+    pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
+        self.cold_start_info = Some(info);
+    }
+
    pub fn set_project(&mut self, x: MetricsAuxInfo) {
        self.set_endpoint_id(x.endpoint_id);
        self.branch = Some(x.branch_id);
@@ -168,7 +173,7 @@ impl RequestMonitoring {
 impl Drop for RequestMonitoring {
    fn drop(&mut self) {
        if let Some(tx) = self.sender.take() {
-            let _: Result<(), _> = tx.send(self.clone());
+            let _: Result<(), _> = tx.send(RequestData::from(&*self));
        }
    }
 }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 // * after each rowgroup write, we check the length of the file and upload to s3 if large enough

 #[derive(parquet_derive::ParquetRecordWriter)]
-struct RequestData {
+pub struct RequestData {
    region: &'static str,
    protocol: &'static str,
    /// Must be UTC. The derive macro doesn't like the timezones
@@ -93,14 +93,14 @@ struct RequestData {
    /// Or if we make it to proxy_pass
    success: bool,
    /// Indicates if the cplane started the new compute node for this request.
-    cold_start_info: Option<String>,
+    cold_start_info: Option<&'static str>,
    /// Tracks time from session start (HTTP request/libpq TCP handshake)
    /// Through to success/failure
    duration_us: u64,
 }

-impl From<RequestMonitoring> for RequestData {
-    fn from(value: RequestMonitoring) -> Self {
+impl From<&RequestMonitoring> for RequestData {
+    fn from(value: &RequestMonitoring) -> Self {
        Self {
            session_id: value.session_id,
            peer_addr: value.peer_addr.to_string(),
@@ -121,10 +121,12 @@ impl From<RequestMonitoring> for RequestData {
            region: value.region,
            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
            success: value.success,
-            cold_start_info: value
-                .cold_start_info
-                .as_ref()
-                .map(|x| serde_json::to_string(x).unwrap_or_default()),
+            cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
+                crate::console::messages::ColdStartInfo::Unknown => "unknown",
+                crate::console::messages::ColdStartInfo::Warm => "warm",
+                crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
+                crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
+            }),
            duration_us: SystemTime::from(value.first_packet)
                .elapsed()
                .unwrap_or_default()
@@ -458,7 +460,7 @@ mod tests {
            region: "us-east-1",
            error: None,
            success: rng.gen(),
-            cold_start_info: Some("no".into()),
+            cold_start_info: Some("no"),
            duration_us: rng.gen_range(0..30_000_000),
        }
    }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -7,7 +7,7 @@ use ::metrics::{
 use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};

 use once_cell::sync::Lazy;
-use tokio::time;
+use tokio::time::{self, Instant};

 pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
    register_int_counter_pair_vec!(
@@ -46,9 +46,9 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "proxy_compute_connection_latency_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure
-        // 3 * 2 * 2 * 2 = 24 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome"],
+        // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
+        // 3 * 2 * 2 * 2 * 2 = 48 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
        // largest bucket = 2^16 * 0.5ms = 32s
        exponential_buckets(0.0005, 2.0, 16).unwrap(),
    )
@@ -161,12 +161,26 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

-#[derive(Clone)]
+pub enum Waiting {
+    Cplane,
+    Client,
+    Compute,
+}
+
+#[derive(Default)]
+struct Accumulated {
+    cplane: time::Duration,
+    client: time::Duration,
+    compute: time::Duration,
+}
+
 pub struct LatencyTimer {
    // time since the stopwatch was started
-    start: Option<time::Instant>,
+    start: time::Instant,
+    // time since the stopwatch was stopped
+    stop: Option<time::Instant>,
    // accumulated time on the stopwatch
-    pub accumulated: std::time::Duration,
+    accumulated: Accumulated,
    // label data
    protocol: &'static str,
    cache_miss: bool,
@@ -176,13 +190,16 @@ pub struct LatencyTimer {

 pub struct LatencyTimerPause<'a> {
    timer: &'a mut LatencyTimer,
+    start: time::Instant,
+    waiting_for: Waiting,
 }

 impl LatencyTimer {
    pub fn new(protocol: &'static str) -> Self {
        Self {
-            start: Some(time::Instant::now()),
-            accumulated: std::time::Duration::ZERO,
+            start: time::Instant::now(),
+            stop: None,
+            accumulated: Accumulated::default(),
            protocol,
            cache_miss: false,
            // by default we don't do pooling
@@ -192,11 +209,12 @@ impl LatencyTimer {
        }
    }

-    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
-        // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
-        LatencyTimerPause { timer: self }
+    pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
+        LatencyTimerPause {
+            timer: self,
+            start: Instant::now(),
+            waiting_for,
+        }
    }

    pub fn cache_miss(&mut self) {
@@ -209,9 +227,7 @@ impl LatencyTimer {

    pub fn success(&mut self) {
        // stop the stopwatch and record the time that we have accumulated
-        if let Some(start) = self.start.take() {
-            self.accumulated += start.elapsed();
-        }
+        self.stop = Some(time::Instant::now());

        // success
        self.outcome = "success";
@@ -220,23 +236,42 @@ impl LatencyTimer {

 impl Drop for LatencyTimerPause<'_> {
    fn drop(&mut self) {
-        // start the stopwatch again
-        self.timer.start = Some(time::Instant::now());
+        let dur = self.start.elapsed();
+        match self.waiting_for {
+            Waiting::Cplane => self.timer.accumulated.cplane += dur,
+            Waiting::Client => self.timer.accumulated.client += dur,
+            Waiting::Compute => self.timer.accumulated.compute += dur,
+        }
    }
 }

 impl Drop for LatencyTimer {
    fn drop(&mut self) {
-        let duration =
-            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
+        let duration = self
+            .stop
+            .unwrap_or_else(time::Instant::now)
+            .duration_since(self.start);
+        // Excluding cplane communication from the accumulated time.
        COMPUTE_CONNECTION_LATENCY
            .with_label_values(&[
                self.protocol,
                bool_to_str(self.cache_miss),
                bool_to_str(self.pool_miss),
                self.outcome,
+                "client",
            ])
-            .observe(duration.as_secs_f64())
+            .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
+        // Exclude client and cplane communication from the accumulated time.
+        let accumulated_total = self.accumulated.client + self.accumulated.cplane;
+        COMPUTE_CONNECTION_LATENCY
+            .with_label_values(&[
+                self.protocol,
+                bool_to_str(self.cache_miss),
+                bool_to_str(self.pool_miss),
+                self.outcome,
+                "client_and_cplane",
+            ])
+            .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
    }
 }

--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -17,7 +17,7 @@ use pin_project_lite::pin_project;
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
 use uuid::Uuid;

-use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept};
+use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;

 pub struct ProxyProtocolAccept {
    pub incoming: AddrIncoming,
@@ -331,17 +331,24 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
    }
 }

-impl AsyncAccept for ProxyProtocolAccept {
-    type Connection = WithConnectionGuard<WithClientIp<AddrStream>>;
+impl Accept for ProxyProtocolAccept {
+    type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;

    type Error = io::Error;

    fn poll_accept(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
+    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
        let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
-        tracing::info!(protocol = self.protocol, "accepted new TCP connection");
+
+        let conn_id = uuid::Uuid::new_v4();
+        let span = tracing::info_span!("http_conn", ?conn_id);
+        {
+            let _enter = span.enter();
+            tracing::info!("accepted new TCP connection");
+        }
+
        let Some(conn) = conn else {
            return Poll::Ready(None);
        };
@@ -354,6 +361,7 @@ impl AsyncAccept for ProxyProtocolAccept {
                    .with_label_values(&[self.protocol])
                    .guard(),
            )),
+            span,
        })))
    }
 }
@@ -364,6 +372,14 @@ pin_project! {
        pub inner: T,
        pub connection_id: Uuid,
        pub gauge: Mutex<Option<IntCounterPairGuard>>,
+        pub span: tracing::Span,
+    }
+
+    impl<S> PinnedDrop for WithConnectionGuard<S> {
+        fn drop(this: Pin<&mut Self>) {
+            let _enter = this.span.enter();
+            tracing::info!("HTTP connection closed")
+        }
    }
 }

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -248,7 +248,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(

    let tls = config.tls_config.as_ref();

-    let pause = ctx.latency_timer.pause();
+    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
    let do_handshake = handshake(stream, mode.handshake_tls(tls));
    let (mut stream, params) =
        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -69,6 +69,12 @@ fn report_error(e: &WakeComputeError, retry: bool) {
        {
            "quota_exceeded"
        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::UNPROCESSABLE_ENTITY,
+            ref text,
+        }) if text.contains("compute time quota of non-primary branches is exceeded") => {
+            "quota_exceeded"
+        }
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::LOCKED,
            ..
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -19,29 +19,24 @@ use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
+use tracing::instrument::Instrumented;

 use crate::context::RequestMonitoring;
-use crate::metrics::TLS_HANDSHAKE_FAILURES;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancellationHandler, config::ProxyConfig};
-use futures::StreamExt;
 use hyper::{
-    server::{
-        accept,
-        conn::{AddrIncoming, AddrStream},
-    },
+    server::conn::{AddrIncoming, AddrStream},
    Body, Method, Request, Response,
 };

-use std::convert::Infallible;
 use std::net::IpAddr;
+use std::sync::Arc;
 use std::task::Poll;
-use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
 use tokio::net::TcpListener;
-use tokio_util::sync::CancellationToken;
+use tokio_util::sync::{CancellationToken, DropGuard};
 use tracing::{error, info, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

@@ -105,19 +100,7 @@ pub async fn task_main(
    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
    ws_connections.close(); // allows `ws_connections.wait to complete`

-    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
-        if let Err(err) = conn {
-            error!(
-                protocol = "http",
-                "failed to accept TLS connection: {err:?}"
-            );
-            TLS_HANDSHAKE_FAILURES.inc();
-            ready(false)
-        } else {
-            info!(protocol = "http", "accepted new TLS connection");
-            ready(true)
-        }
-    });
+    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout);

    let make_svc = hyper::service::make_service_fn(
        |stream: &tokio_rustls::server::TlsStream<
@@ -133,6 +116,11 @@ pub async fn task_main(
                .take()
                .expect("gauge should be set on connection start");

+            // Cancel all current inflight HTTP requests if the HTTP connection is closed.
+            let http_cancellation_token = CancellationToken::new();
+            let cancel_connection = http_cancellation_token.clone().drop_guard();
+
+            let span = conn.span.clone();
            let client_addr = conn.inner.client_addr();
            let remote_addr = conn.inner.inner.remote_addr();
            let backend = backend.clone();
@@ -148,33 +136,49 @@ pub async fn task_main(
                Ok(MetricService::new(
                    hyper::service::service_fn(move |req: Request<Body>| {
                        let backend = backend.clone();
-                        let ws_connections = ws_connections.clone();
+                        let ws_connections2 = ws_connections.clone();
                        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
                        let cancellation_handler = cancellation_handler.clone();
+                        let http_cancellation_token = http_cancellation_token.child_token();

-                        async move {
-                            Ok::<_, Infallible>(
-                                request_handler(
+                        // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
+                        // By spawning the future, we ensure it never gets cancelled until it decides to.
+                        ws_connections.spawn(
+                            async move {
+                                // Cancel the current inflight HTTP request if the requets stream is closed.
+                                // This is slightly different to `_cancel_connection` in that
+                                // h2 can cancel individual requests with a `RST_STREAM`.
+                                let _cancel_session = http_cancellation_token.clone().drop_guard();
+
+                                let res = request_handler(
                                    req,
                                    config,
                                    backend,
-                                    ws_connections,
+                                    ws_connections2,
                                    cancellation_handler,
                                    peer_addr.ip(),
                                    endpoint_rate_limiter,
+                                    http_cancellation_token,
                                )
                                .await
-                                .map_or_else(|e| e.into_response(), |r| r),
-                            )
-                        }
+                                .map_or_else(|e| e.into_response(), |r| r);
+
+                                _cancel_session.disarm();
+
+                                res
+                            }
+                            .in_current_span(),
+                        )
                    }),
                    gauge,
+                    cancel_connection,
+                    span,
                ))
            }
        },
    );

-    hyper::Server::builder(accept::from_stream(tls_listener))
+    hyper::Server::builder(tls_listener)
        .serve(make_svc)
        .with_graceful_shutdown(cancellation_token.cancelled())
        .await?;
@@ -188,11 +192,23 @@ pub async fn task_main(
 struct MetricService<S> {
    inner: S,
    _gauge: IntCounterPairGuard,
+    _cancel: DropGuard,
+    span: tracing::Span,
 }

 impl<S> MetricService<S> {
-    fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService<S> {
-        MetricService { inner, _gauge }
+    fn new(
+        inner: S,
+        _gauge: IntCounterPairGuard,
+        _cancel: DropGuard,
+        span: tracing::Span,
+    ) -> MetricService<S> {
+        MetricService {
+            inner,
+            _gauge,
+            _cancel,
+            span,
+        }
    }
 }

@@ -202,14 +218,16 @@ where
 {
    type Response = S::Response;
    type Error = S::Error;
-    type Future = S::Future;
+    type Future = Instrumented<S::Future>;

    fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
        self.inner.poll_ready(cx)
    }

    fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
-        self.inner.call(req)
+        self.span
+            .in_scope(|| self.inner.call(req))
+            .instrument(self.span.clone())
    }
 }

@@ -222,6 +240,8 @@ async fn request_handler(
    cancellation_handler: Arc<CancellationHandler>,
    peer_addr: IpAddr,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    // used to cancel in-flight HTTP requests. not used to cancel websockets
+    http_cancellation_token: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let session_id = uuid::Uuid::new_v4();

@@ -265,7 +285,7 @@ async fn request_handler(
        let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
        let span = ctx.span.clone();

-        sql_over_http::handle(config, ctx, request, backend)
+        sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
            .instrument(span)
            .await
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -9,9 +9,11 @@ use crate::{
    config::ProxyConfig,
    console::{
        errors::{GetAuthInfoError, WakeComputeError},
+        messages::ColdStartInfo,
        CachedNodeInfo,
    },
    context::RequestMonitoring,
+    error::{ErrorKind, ReportableError, UserFacingError},
    proxy::connect_compute::ConnectMechanism,
 };

@@ -82,6 +84,7 @@ impl PoolingBackend {
        };

        if let Some(client) = maybe_client {
+            ctx.set_cold_start_info(ColdStartInfo::Warm);
            return Ok(client);
        }
        let conn_id = uuid::Uuid::new_v4();
@@ -117,6 +120,30 @@ pub enum HttpConnError {
    WakeCompute(#[from] WakeComputeError),
 }

+impl ReportableError for HttpConnError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
+            HttpConnError::ConnectionError(p) => p.get_error_kind(),
+            HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
+            HttpConnError::AuthError(a) => a.get_error_kind(),
+            HttpConnError::WakeCompute(w) => w.get_error_kind(),
+        }
+    }
+}
+
+impl UserFacingError for HttpConnError {
+    fn to_string_client(&self) -> String {
+        match self {
+            HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
+            HttpConnError::ConnectionError(p) => p.to_string(),
+            HttpConnError::GetAuthInfo(c) => c.to_string_client(),
+            HttpConnError::AuthError(c) => c.to_string_client(),
+            HttpConnError::WakeCompute(c) => c.to_string_client(),
+        }
+    }
+}
+
 struct TokioMechanism {
    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
    conn_info: ConnInfo,
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -119,16 +119,12 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
        }
    }

-    fn put(
-        pool: &RwLock<Self>,
-        conn_info: &ConnInfo,
-        client: ClientInner<C>,
-    ) -> anyhow::Result<()> {
+    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
        let conn_id = client.conn_id;

        if client.is_closed() {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return Ok(());
+            return;
        }
        let global_max_conn = pool.read().global_pool_size_max_conns;
        if pool
@@ -138,7 +134,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
            >= global_max_conn
        {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
-            return Ok(());
+            return;
        }

        // return connection to the pool
@@ -172,8 +168,6 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
        } else {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
        }
-
-        Ok(())
    }
 }

@@ -612,13 +606,6 @@ impl<C: ClientInnerExt> Client<C> {
        let inner = inner.as_mut().expect("client inner should not be removed");
        (&mut inner.inner, Discard { pool, conn_info })
    }
-
-    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        self.inner().1.check_idle(status)
-    }
-    pub fn discard(&mut self) {
-        self.inner().1.discard()
-    }
 }

 impl<C: ClientInnerExt> Discard<'_, C> {
@@ -660,7 +647,7 @@ impl<C: ClientInnerExt> Client<C> {
            // return connection to the pool
            return Some(move || {
                let _span = current_span.enter();
-                let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
+                EndpointConnPool::put(&conn_pool, &conn_info, client);
            });
        }
        None
@@ -739,7 +726,7 @@ mod tests {
        {
            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
            assert_eq!(0, pool.get_global_connections_count());
-            client.discard();
+            client.inner().1.discard();
            // Discard should not add the connection from the pool.
            assert_eq!(0, pool.get_global_connections_count());
        }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,7 +1,11 @@
+use std::pin::pin;
 use std::sync::Arc;

-use anyhow::bail;
+use futures::future::select;
+use futures::future::try_join;
+use futures::future::Either;
 use futures::StreamExt;
+use futures::TryFutureExt;
 use hyper::body::HttpBody;
 use hyper::header;
 use hyper::http::HeaderName;
@@ -11,13 +15,16 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
-use tokio::try_join;
+use tokio::time;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
+use tokio_postgres::error::SqlState;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
+use tokio_postgres::NoTls;
 use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
+use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing::info;
 use url::Url;
@@ -30,9 +37,13 @@ use crate::auth::ComputeUserInfoParseError;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
+use crate::error::ErrorKind;
+use crate::error::ReportableError;
+use crate::error::UserFacingError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::serverless::backend::HttpConnError;
 use crate::DbName;
 use crate::RoleName;

@@ -40,6 +51,7 @@ use super::backend::PoolingBackend;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
+use super::json::JsonConversionError;

 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -110,6 +122,18 @@ pub enum ConnInfoError {
    MalformedEndpoint,
 }

+impl ReportableError for ConnInfoError {
+    fn get_error_kind(&self) -> ErrorKind {
+        ErrorKind::User
+    }
+}
+
+impl UserFacingError for ConnInfoError {
+    fn to_string_client(&self) -> String {
+        self.to_string()
+    }
+}
+
 fn get_conn_info(
    ctx: &mut RequestMonitoring,
    headers: &HeaderMap,
@@ -193,109 +217,124 @@ pub async fn handle(
    mut ctx: RequestMonitoring,
    request: Request<Body>,
    backend: Arc<PoolingBackend>,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let result = tokio::time::timeout(
-        config.http_config.request_timeout,
-        handle_inner(config, &mut ctx, request, backend),
-    )
-    .await;
+    let cancel2 = cancel.clone();
+    let handle = tokio::spawn(async move {
+        time::sleep(config.http_config.request_timeout).await;
+        cancel2.cancel();
+    });
+
+    let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
+    handle.abort();
+
    let mut response = match result {
-        Ok(r) => match r {
-            Ok(r) => {
-                ctx.set_success();
-                r
-            }
-            Err(e) => {
-                // TODO: ctx.set_error_kind(e.get_error_type());
-
-                let mut message = format!("{:?}", e);
-                let db_error = e
-                    .downcast_ref::<tokio_postgres::Error>()
-                    .and_then(|e| e.as_db_error());
-                fn get<'a, T: serde::Serialize>(
-                    db: Option<&'a DbError>,
-                    x: impl FnOnce(&'a DbError) -> T,
-                ) -> Value {
-                    db.map(x)
-                        .and_then(|t| serde_json::to_value(t).ok())
-                        .unwrap_or_default()
-                }
-
-                if let Some(db_error) = db_error {
-                    db_error.message().clone_into(&mut message);
-                }
-
-                let position = db_error.and_then(|db| db.position());
-                let (position, internal_position, internal_query) = match position {
-                    Some(ErrorPosition::Original(position)) => (
-                        Value::String(position.to_string()),
-                        Value::Null,
-                        Value::Null,
-                    ),
-                    Some(ErrorPosition::Internal { position, query }) => (
-                        Value::Null,
-                        Value::String(position.to_string()),
-                        Value::String(query.clone()),
-                    ),
-                    None => (Value::Null, Value::Null, Value::Null),
-                };
-
-                let code = get(db_error, |db| db.code().code());
-                let severity = get(db_error, |db| db.severity());
-                let detail = get(db_error, |db| db.detail());
-                let hint = get(db_error, |db| db.hint());
-                let where_ = get(db_error, |db| db.where_());
-                let table = get(db_error, |db| db.table());
-                let column = get(db_error, |db| db.column());
-                let schema = get(db_error, |db| db.schema());
-                let datatype = get(db_error, |db| db.datatype());
-                let constraint = get(db_error, |db| db.constraint());
-                let file = get(db_error, |db| db.file());
-                let line = get(db_error, |db| db.line().map(|l| l.to_string()));
-                let routine = get(db_error, |db| db.routine());
-
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                // TODO: this shouldn't always be bad request.
-                json_response(
-                    StatusCode::BAD_REQUEST,
-                    json!({
-                        "message": message,
-                        "code": code,
-                        "detail": detail,
-                        "hint": hint,
-                        "position": position,
-                        "internalPosition": internal_position,
-                        "internalQuery": internal_query,
-                        "severity": severity,
-                        "where": where_,
-                        "table": table,
-                        "column": column,
-                        "schema": schema,
-                        "dataType": datatype,
-                        "constraint": constraint,
-                        "file": file,
-                        "line": line,
-                        "routine": routine,
-                    }),
-                )?
-            }
-        },
-        Err(_) => {
-            // TODO: when http error classification is done, distinguish between
-            // timeout on sql vs timeout in proxy/cplane
-            // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
+        Ok(r) => {
+            ctx.set_success();
+            r
+        }
+        Err(e @ SqlOverHttpError::Cancelled(_)) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);

            let message = format!(
-                "HTTP-Connection timed out, execution time exceeded {} seconds",
-                config.http_config.request_timeout.as_secs()
+                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
+                config.http_config.request_timeout.as_secs_f64()
            );
-            error!(message);
+
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
+            );
+
            json_response(
-                StatusCode::GATEWAY_TIMEOUT,
-                json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
+            )?
+        }
+        Err(e) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);
+
+            let mut message = e.to_string_client();
+            let db_error = match &e {
+                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                _ => None,
+            };
+            fn get<'a, T: serde::Serialize>(
+                db: Option<&'a DbError>,
+                x: impl FnOnce(&'a DbError) -> T,
+            ) -> Value {
+                db.map(x)
+                    .and_then(|t| serde_json::to_value(t).ok())
+                    .unwrap_or_default()
+            }
+
+            if let Some(db_error) = db_error {
+                db_error.message().clone_into(&mut message);
+            }
+
+            let position = db_error.and_then(|db| db.position());
+            let (position, internal_position, internal_query) = match position {
+                Some(ErrorPosition::Original(position)) => (
+                    Value::String(position.to_string()),
+                    Value::Null,
+                    Value::Null,
+                ),
+                Some(ErrorPosition::Internal { position, query }) => (
+                    Value::Null,
+                    Value::String(position.to_string()),
+                    Value::String(query.clone()),
+                ),
+                None => (Value::Null, Value::Null, Value::Null),
+            };
+
+            let code = get(db_error, |db| db.code().code());
+            let severity = get(db_error, |db| db.severity());
+            let detail = get(db_error, |db| db.detail());
+            let hint = get(db_error, |db| db.hint());
+            let where_ = get(db_error, |db| db.where_());
+            let table = get(db_error, |db| db.table());
+            let column = get(db_error, |db| db.column());
+            let schema = get(db_error, |db| db.schema());
+            let datatype = get(db_error, |db| db.datatype());
+            let constraint = get(db_error, |db| db.constraint());
+            let file = get(db_error, |db| db.file());
+            let line = get(db_error, |db| db.line().map(|l| l.to_string()));
+            let routine = get(db_error, |db| db.routine());
+
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
+            );
+
+            // TODO: this shouldn't always be bad request.
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({
+                    "message": message,
+                    "code": code,
+                    "detail": detail,
+                    "hint": hint,
+                    "position": position,
+                    "internalPosition": internal_position,
+                    "internalQuery": internal_query,
+                    "severity": severity,
+                    "where": where_,
+                    "table": table,
+                    "column": column,
+                    "schema": schema,
+                    "dataType": datatype,
+                    "constraint": constraint,
+                    "file": file,
+                    "line": line,
+                    "routine": routine,
+                }),
            )?
        }
    };
@@ -307,12 +346,101 @@ pub async fn handle(
    Ok(response)
 }

+#[derive(Debug, thiserror::Error)]
+pub enum SqlOverHttpError {
+    #[error("{0}")]
+    ReadPayload(#[from] ReadPayloadError),
+    #[error("{0}")]
+    ConnectCompute(#[from] HttpConnError),
+    #[error("{0}")]
+    ConnInfo(#[from] ConnInfoError),
+    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
+    RequestTooLarge,
+    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
+    ResponseTooLarge,
+    #[error("invalid isolation level")]
+    InvalidIsolationLevel,
+    #[error("{0}")]
+    Postgres(#[from] tokio_postgres::Error),
+    #[error("{0}")]
+    JsonConversion(#[from] JsonConversionError),
+    #[error("{0}")]
+    Cancelled(SqlOverHttpCancel),
+}
+
+impl ReportableError for SqlOverHttpError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
+            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
+            SqlOverHttpError::Postgres(p) => p.get_error_kind(),
+            SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
+            SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
+        }
+    }
+}
+
+impl UserFacingError for SqlOverHttpError {
+    fn to_string_client(&self) -> String {
+        match self {
+            SqlOverHttpError::ReadPayload(p) => p.to_string(),
+            SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
+            SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
+            SqlOverHttpError::RequestTooLarge => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
+            SqlOverHttpError::Postgres(p) => p.to_string(),
+            SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
+            SqlOverHttpError::Cancelled(_) => self.to_string(),
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum ReadPayloadError {
+    #[error("could not read the HTTP request body: {0}")]
+    Read(#[from] hyper::Error),
+    #[error("could not parse the HTTP request body: {0}")]
+    Parse(#[from] serde_json::Error),
+}
+
+impl ReportableError for ReadPayloadError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
+            ReadPayloadError::Parse(_) => ErrorKind::User,
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SqlOverHttpCancel {
+    #[error("query was cancelled")]
+    Postgres,
+    #[error("query was cancelled while stuck trying to connect to the database")]
+    Connect,
+}
+
+impl ReportableError for SqlOverHttpCancel {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpCancel::Postgres => ErrorKind::RateLimit,
+            SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit,
+        }
+    }
+}
+
 async fn handle_inner(
+    cancel: CancellationToken,
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
    request: Request<Body>,
    backend: Arc<PoolingBackend>,
-) -> anyhow::Result<Response<Body>> {
+) -> Result<Response<Body>, SqlOverHttpError> {
    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
        .with_label_values(&[ctx.protocol])
        .guard();
@@ -345,7 +473,7 @@ async fn handle_inner(
            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
            b"ReadCommitted" => IsolationLevel::ReadCommitted,
            b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => bail!("invalid isolation level"),
+            _ => return Err(SqlOverHttpError::InvalidIsolationLevel),
        }),
        None => None,
    };
@@ -363,19 +491,16 @@ async fn handle_inner(
    // we don't have a streaming request support yet so this is to prevent OOM
    // from a malicious user sending an extremely large request body
    if request_content_length > MAX_REQUEST_SIZE {
-        return Err(anyhow::anyhow!(
-            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
-        ));
+        return Err(SqlOverHttpError::RequestTooLarge);
    }

    let fetch_and_process_request = async {
-        let body = hyper::body::to_bytes(request.into_body())
-            .await
-            .map_err(anyhow::Error::from)?;
+        let body = hyper::body::to_bytes(request.into_body()).await?;
        info!(length = body.len(), "request payload read");
        let payload: Payload = serde_json::from_slice(&body)?;
-        Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
-    };
+        Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
+    }
+    .map_err(SqlOverHttpError::from);

    let authenticate_and_connect = async {
        let keys = backend.authenticate(ctx, &conn_info).await?;
@@ -385,11 +510,25 @@ async fn handle_inner(
        // not strictly necessary to mark success here,
        // but it's just insurance for if we forget it somewhere else
        ctx.latency_timer.success();
-        Ok::<_, anyhow::Error>(client)
-    };
+        Ok::<_, HttpConnError>(client)
+    }
+    .map_err(SqlOverHttpError::from);

    // Run both operations in parallel
-    let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
+    let (payload, mut client) = match select(
+        try_join(
+            pin!(fetch_and_process_request),
+            pin!(authenticate_and_connect),
+        ),
+        pin!(cancel.cancelled()),
+    )
+    .await
+    {
+        Either::Left((result, _cancelled)) => result?,
+        Either::Right((_cancelled, _)) => {
+            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
+        }
+    };

    let mut response = Response::builder()
        .status(StatusCode::OK)
@@ -401,19 +540,64 @@ async fn handle_inner(
    let mut size = 0;
    let result = match payload {
        Payload::Single(stmt) => {
-            let (status, results) =
-                query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode)
-                    .await
-                    .map_err(|e| {
-                        client.discard();
-                        e
-                    })?;
-            client.check_idle(status);
-            results
+            let mut size = 0;
+            let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
+            let query = pin!(query_to_json(
+                &*inner,
+                stmt,
+                &mut size,
+                raw_output,
+                default_array_mode
+            ));
+            let cancelled = pin!(cancel.cancelled());
+            let res = select(query, cancelled).await;
+            match res {
+                Either::Left((Ok((status, results)), _cancelled)) => {
+                    discard.check_idle(status);
+                    results
+                }
+                Either::Left((Err(e), _cancelled)) => {
+                    discard.discard();
+                    return Err(e);
+                }
+                Either::Right((_cancelled, query)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    match time::timeout(time::Duration::from_millis(100), query).await {
+                        Ok(Ok((status, results))) => {
+                            discard.check_idle(status);
+                            results
+                        }
+                        Ok(Err(error)) => {
+                            let db_error = match &error {
+                                SqlOverHttpError::ConnectCompute(
+                                    HttpConnError::ConnectionError(e),
+                                )
+                                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                                _ => None,
+                            };
+
+                            // if errored for some other reason, it might not be safe to return
+                            if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
+                                discard.discard();
+                            }
+
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                        }
+                        Err(_timeout) => {
+                            discard.discard();
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                        }
+                    }
+                }
+            }
        }
        Payload::Batch(statements) => {
            info!("starting transaction");
            let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
            let mut builder = inner.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
                builder = builder.isolation_level(isolation_level);
@@ -433,6 +617,7 @@ async fn handle_inner(
            })?;

            let results = match query_batch(
+                cancel.child_token(),
                &transaction,
                statements,
                &mut size,
@@ -452,6 +637,15 @@ async fn handle_inner(
                    discard.check_idle(status);
                    results
                }
+                Err(SqlOverHttpError::Cancelled(_)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+                    discard.discard();
+
+                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                }
                Err(err) => {
                    info!("rollback");
                    let status = transaction.rollback().await.map_err(|e| {
@@ -466,16 +660,10 @@ async fn handle_inner(
            };

            if txn_read_only {
-                response = response.header(
-                    TXN_READ_ONLY.clone(),
-                    HeaderValue::try_from(txn_read_only.to_string())?,
-                );
+                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
            }
            if txn_deferrable {
-                response = response.header(
-                    TXN_DEFERRABLE.clone(),
-                    HeaderValue::try_from(txn_deferrable.to_string())?,
-                );
+                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
            }
            if let Some(txn_isolation_level) = txn_isolation_level_raw {
                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
@@ -503,19 +691,37 @@ async fn handle_inner(
 }

 async fn query_batch(
+    cancel: CancellationToken,
    transaction: &Transaction<'_>,
    queries: BatchQueryData,
    total_size: &mut usize,
    raw_output: bool,
    array_mode: bool,
-) -> anyhow::Result<Vec<Value>> {
+) -> Result<Vec<Value>, SqlOverHttpError> {
    let mut results = Vec::with_capacity(queries.queries.len());
    let mut current_size = 0;
    for stmt in queries.queries {
-        // TODO: maybe we should check that the transaction bit is set here
-        let (_, values) =
-            query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
-        results.push(values);
+        let query = pin!(query_to_json(
+            transaction,
+            stmt,
+            &mut current_size,
+            raw_output,
+            array_mode
+        ));
+        let cancelled = pin!(cancel.cancelled());
+        let res = select(query, cancelled).await;
+        match res {
+            // TODO: maybe we should check that the transaction bit is set here
+            Either::Left((Ok((_, values)), _cancelled)) => {
+                results.push(values);
+            }
+            Either::Left((Err(e), _cancelled)) => {
+                return Err(e);
+            }
+            Either::Right((_cancelled, _)) => {
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+            }
+        }
    }
    *total_size += current_size;
    Ok(results)
@@ -527,7 +733,7 @@ async fn query_to_json<T: GenericClient>(
    current_size: &mut usize,
    raw_output: bool,
    default_array_mode: bool,
-) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
+) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
    info!("executing query");
    let query_params = data.params;
    let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
@@ -544,9 +750,7 @@ async fn query_to_json<T: GenericClient>(
        // we don't have a streaming response support yet so this is to prevent OOM
        // from a malicious query (eg a cross join)
        if *current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
+            return Err(SqlOverHttpError::ResponseTooLarge);
        }
    }

--- a/proxy/src/serverless/tls_listener.rs
+++ b/proxy/src/serverless/tls_listener.rs
@@ -1,186 +1,106 @@
 use std::{
+    convert::Infallible,
    pin::Pin,
    task::{Context, Poll},
    time::Duration,
 };

-use futures::{Future, Stream, StreamExt};
+use hyper::server::{accept::Accept, conn::AddrStream};
 use pin_project_lite::pin_project;
-use thiserror::Error;
 use tokio::{
    io::{AsyncRead, AsyncWrite},
    task::JoinSet,
    time::timeout,
 };
+use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tracing::{info, warn, Instrument};

-/// Default timeout for the TLS handshake.
-pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
-
-/// Trait for TLS implementation.
-///
-/// Implementations are provided by the rustls and native-tls features.
-pub trait AsyncTls<C: AsyncRead + AsyncWrite>: Clone {
-    /// The type of the TLS stream created from the underlying stream.
-    type Stream: Send + 'static;
-    /// Error type for completing the TLS handshake
-    type Error: std::error::Error + Send + 'static;
-    /// Type of the Future for the TLS stream that is accepted.
-    type AcceptFuture: Future<Output = Result<Self::Stream, Self::Error>> + Send + 'static;
-
-    /// Accept a TLS connection on an underlying stream
-    fn accept(&self, stream: C) -> Self::AcceptFuture;
-}
-
-/// Asynchronously accept connections.
-pub trait AsyncAccept {
-    /// The type of the connection that is accepted.
-    type Connection: AsyncRead + AsyncWrite;
-    /// The type of error that may be returned.
-    type Error;
-
-    /// Poll to accept the next connection.
-    fn poll_accept(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>>;
-
-    /// Return a new `AsyncAccept` that stops accepting connections after
-    /// `ender` completes.
-    ///
-    /// Useful for graceful shutdown.
-    ///
-    /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs)
-    /// for example of how to use.
-    fn until<F: Future>(self, ender: F) -> Until<Self, F>
-    where
-        Self: Sized,
-    {
-        Until {
-            acceptor: self,
-            ender,
-        }
-    }
-}
+use crate::{
+    metrics::TLS_HANDSHAKE_FAILURES,
+    protocol2::{WithClientIp, WithConnectionGuard},
+};

 pin_project! {
-    ///
    /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
    /// encrypted using TLS.
-    ///
-    /// It is similar to:
-    ///
-    /// ```ignore
-    /// tcpListener.and_then(|s| tlsAcceptor.accept(s))
-    /// ```
-    ///
-    /// except that it has the ability to accept multiple transport-level connections
-    /// simultaneously while the TLS handshake is pending for other connections.
-    ///
-    /// By default, if a client fails the TLS handshake, that is treated as an error, and the
-    /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper
-    /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections.
-    /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this.
-    ///
-    /// Note that if the maximum number of pending connections is greater than 1, the resulting
-    /// [`T::Stream`][4] connections may come in a different order than the connections produced by the
-    /// underlying listener.
-    ///
-    /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html
-    /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs
-    /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs
-    /// [4]: AsyncTls::Stream
-    ///
-    #[allow(clippy::type_complexity)]
-    pub struct TlsListener<A: AsyncAccept, T: AsyncTls<A::Connection>> {
+    pub(crate) struct TlsListener<A: Accept> {
        #[pin]
        listener: A,
-        tls: T,
-        waiting: JoinSet<Result<Result<T::Stream, T::Error>, tokio::time::error::Elapsed>>,
+        tls: TlsAcceptor,
+        waiting: JoinSet<Option<TlsStream<A::Conn>>>,
        timeout: Duration,
    }
 }

-/// Builder for `TlsListener`.
-#[derive(Clone)]
-pub struct Builder<T> {
-    tls: T,
-    handshake_timeout: Duration,
-}
-
-/// Wraps errors from either the listener or the TLS Acceptor
-#[derive(Debug, Error)]
-pub enum Error<LE: std::error::Error, TE: std::error::Error> {
-    /// An error that arose from the listener ([AsyncAccept::Error])
-    #[error("{0}")]
-    ListenerError(#[source] LE),
-    /// An error that occurred during the TLS accept handshake
-    #[error("{0}")]
-    TlsAcceptError(#[source] TE),
-}
-
-impl<A: AsyncAccept, T> TlsListener<A, T>
-where
-    T: AsyncTls<A::Connection>,
-{
+impl<A: Accept> TlsListener<A> {
    /// Create a `TlsListener` with default options.
-    pub fn new(tls: T, listener: A) -> Self {
-        builder(tls).listen(listener)
+    pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self {
+        TlsListener {
+            listener,
+            tls,
+            waiting: JoinSet::new(),
+            timeout,
+        }
    }
 }

-impl<A, T> TlsListener<A, T>
+impl<A> Accept for TlsListener<A>
 where
-    A: AsyncAccept,
+    A: Accept<Conn = WithConnectionGuard<WithClientIp<AddrStream>>>,
    A::Error: std::error::Error,
-    T: AsyncTls<A::Connection>,
+    A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static,
 {
-    /// Accept the next connection
-    ///
-    /// This is essentially an alias to `self.next()` with a more domain-appropriate name.
-    pub async fn accept(&mut self) -> Option<<Self as Stream>::Item>
-    where
-        Self: Unpin,
-    {
-        self.next().await
-    }
+    type Conn = TlsStream<A::Conn>;

-    /// Replaces the Tls Acceptor configuration, which will be used for new connections.
-    ///
-    /// This can be used to change the certificate used at runtime.
-    pub fn replace_acceptor(&mut self, acceptor: T) {
-        self.tls = acceptor;
-    }
+    type Error = Infallible;

-    /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`.
-    ///
-    /// This is useful if your listener is `!Unpin`.
-    ///
-    /// This can be used to change the certificate used at runtime.
-    pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) {
-        *self.project().tls = acceptor;
-    }
-}
-
-impl<A, T> Stream for TlsListener<A, T>
-where
-    A: AsyncAccept,
-    A::Error: std::error::Error,
-    T: AsyncTls<A::Connection>,
-{
-    type Item = Result<T::Stream, Error<A::Error, T::Error>>;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
        let mut this = self.project();

        loop {
            match this.listener.as_mut().poll_accept(cx) {
                Poll::Pending => break,
-                Poll::Ready(Some(Ok(conn))) => {
-                    this.waiting
-                        .spawn(timeout(*this.timeout, this.tls.accept(conn)));
+                Poll::Ready(Some(Ok(mut conn))) => {
+                    let t = *this.timeout;
+                    let tls = this.tls.clone();
+                    let span = conn.span.clone();
+                    this.waiting.spawn(async move {
+                        let peer_addr = match conn.inner.wait_for_addr().await {
+                            Ok(Some(addr)) => addr,
+                            Err(e) => {
+                                tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
+                                return None;
+                            }
+                            Ok(None) => conn.inner.inner.remote_addr()
+                        };
+
+                        let accept = tls.accept(conn);
+                        match timeout(t, accept).await {
+                            Ok(Ok(conn)) => {
+                                info!(%peer_addr, "accepted new TLS connection");
+                                Some(conn)
+                            },
+                            // The handshake failed, try getting another connection from the queue
+                            Ok(Err(e)) => {
+                                TLS_HANDSHAKE_FAILURES.inc();
+                                warn!(%peer_addr, "failed to accept TLS connection: {e:?}");
+                                None
+                            }
+                            // The handshake timed out, try getting another connection from the queue
+                            Err(_) => {
+                                TLS_HANDSHAKE_FAILURES.inc();
+                                warn!(%peer_addr, "failed to accept TLS connection: timeout");
+                                None
+                            }
+                        }
+                    }.instrument(span));
                }
                Poll::Ready(Some(Err(e))) => {
-                    return Poll::Ready(Some(Err(Error::ListenerError(e))));
+                    tracing::error!("error accepting TCP connection: {e}");
+                    continue;
                }
                Poll::Ready(None) => return Poll::Ready(None),
            }
@@ -188,96 +108,16 @@ where

        loop {
            return match this.waiting.poll_join_next(cx) {
-                Poll::Ready(Some(Ok(Ok(conn)))) => {
-                    Poll::Ready(Some(conn.map_err(Error::TlsAcceptError)))
+                Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))),
+                // The handshake failed to complete, try getting another connection from the queue
+                Poll::Ready(Some(Ok(None))) => continue,
+                // The handshake panicked or was cancelled. ignore and get another connection
+                Poll::Ready(Some(Err(e))) => {
+                    tracing::warn!("handshake aborted: {e}");
+                    continue;
                }
-                // The handshake timed out, try getting another connection from the queue
-                Poll::Ready(Some(Ok(Err(_)))) => continue,
-                // The handshake panicked
-                Poll::Ready(Some(Err(e))) if e.is_panic() => {
-                    std::panic::resume_unwind(e.into_panic())
-                }
-                // The handshake was externally aborted
-                Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"),
                _ => Poll::Pending,
            };
        }
    }
 }
-
-impl<C: AsyncRead + AsyncWrite + Unpin + Send + 'static> AsyncTls<C> for tokio_rustls::TlsAcceptor {
-    type Stream = tokio_rustls::server::TlsStream<C>;
-    type Error = std::io::Error;
-    type AcceptFuture = tokio_rustls::Accept<C>;
-
-    fn accept(&self, conn: C) -> Self::AcceptFuture {
-        tokio_rustls::TlsAcceptor::accept(self, conn)
-    }
-}
-
-impl<T> Builder<T> {
-    /// Set the timeout for handshakes.
-    ///
-    /// If a timeout takes longer than `timeout`, then the handshake will be
-    /// aborted and the underlying connection will be dropped.
-    ///
-    /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`.
-    pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self {
-        self.handshake_timeout = timeout;
-        self
-    }
-
-    /// Create a `TlsListener` from the builder
-    ///
-    /// Actually build the `TlsListener`. The `listener` argument should be
-    /// an implementation of the `AsyncAccept` trait that accepts new connections
-    /// that the `TlsListener` will  encrypt using TLS.
-    pub fn listen<A: AsyncAccept>(&self, listener: A) -> TlsListener<A, T>
-    where
-        T: AsyncTls<A::Connection>,
-    {
-        TlsListener {
-            listener,
-            tls: self.tls.clone(),
-            waiting: JoinSet::new(),
-            timeout: self.handshake_timeout,
-        }
-    }
-}
-
-/// Create a new Builder for a TlsListener
-///
-/// `server_config` will be used to configure the TLS sessions.
-pub fn builder<T>(tls: T) -> Builder<T> {
-    Builder {
-        tls,
-        handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT,
-    }
-}
-
-pin_project! {
-    /// See [`AsyncAccept::until`]
-    pub struct Until<A, E> {
-        #[pin]
-        acceptor: A,
-        #[pin]
-        ender: E,
-    }
-}
-
-impl<A: AsyncAccept, E: Future> AsyncAccept for Until<A, E> {
-    type Connection = A::Connection;
-    type Error = A::Error;
-
-    fn poll_accept(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
-        let this = self.project();
-
-        match this.ender.poll(cx) {
-            Poll::Pending => this.acceptor.poll_accept(cx),
-            Poll::Ready(_) => Poll::Ready(None),
-        }
-    }
-}
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -140,6 +140,13 @@ pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec")
 });
+pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_received_ps_feedbacks_total",
+        "Number of pageserver feedbacks received"
+    )
+    .expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
+});

 pub const LABEL_UNKNOWN: &str = "unknown";

@@ -301,7 +308,8 @@ pub async fn time_io_closure<E: Into<anyhow::Error>>(
 #[derive(Clone)]
 pub struct FullTimelineInfo {
    pub ttid: TenantTimelineId,
-    pub ps_feedback: PageserverFeedback,
+    pub ps_feedback_count: u64,
+    pub last_ps_feedback: PageserverFeedback,
    pub wal_backup_active: bool,
    pub timeline_is_active: bool,
    pub num_computes: u32,
@@ -327,6 +335,7 @@ pub struct TimelineCollector {
    remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
    ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
    feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
+    ps_feedback_count: GenericGaugeVec<AtomicU64>,
    timeline_active: GenericGaugeVec<AtomicU64>,
    wal_backup_active: GenericGaugeVec<AtomicU64>,
    connected_computes: IntGaugeVec,
@@ -430,6 +439,15 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(feedback_last_time_seconds.desc().into_iter().cloned());

+        let ps_feedback_count = GenericGaugeVec::new(
+            Opts::new(
+                "safekeeper_ps_feedback_count_total",
+                "Number of feedbacks received from the pageserver",
+            ),
+            &["tenant_id", "timeline_id"],
+        )
+        .unwrap();
+
        let timeline_active = GenericGaugeVec::new(
            Opts::new(
                "safekeeper_timeline_active",
@@ -538,6 +556,7 @@ impl TimelineCollector {
            remote_consistent_lsn,
            ps_last_received_lsn,
            feedback_last_time_seconds,
+            ps_feedback_count,
            timeline_active,
            wal_backup_active,
            connected_computes,
@@ -570,6 +589,7 @@ impl Collector for TimelineCollector {
        self.remote_consistent_lsn.reset();
        self.ps_last_received_lsn.reset();
        self.feedback_last_time_seconds.reset();
+        self.ps_feedback_count.reset();
        self.timeline_active.reset();
        self.wal_backup_active.reset();
        self.connected_computes.reset();
@@ -646,9 +666,12 @@ impl Collector for TimelineCollector {

            self.ps_last_received_lsn
                .with_label_values(labels)
-                .set(tli.ps_feedback.last_received_lsn.0);
+                .set(tli.last_ps_feedback.last_received_lsn.0);
+            self.ps_feedback_count
+                .with_label_values(labels)
+                .set(tli.ps_feedback_count);
            if let Ok(unix_time) = tli
-                .ps_feedback
+                .last_ps_feedback
                .replytime
                .duration_since(SystemTime::UNIX_EPOCH)
            {
@@ -679,6 +702,7 @@ impl Collector for TimelineCollector {
        mfs.extend(self.remote_consistent_lsn.collect());
        mfs.extend(self.ps_last_received_lsn.collect());
        mfs.extend(self.feedback_last_time_seconds.collect());
+        mfs.extend(self.ps_feedback_count.collect());
        mfs.extend(self.timeline_active.collect());
        mfs.extend(self.wal_backup_active.collect());
        mfs.extend(self.connected_computes.collect());
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -36,11 +36,15 @@ use tokio::time::Instant;
 use tracing::*;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
+use utils::pageserver_feedback::PageserverFeedback;
+
+const DEFAULT_FEEDBACK_CAPACITY: usize = 8;

 /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
 /// in Arc).
 pub struct WalReceivers {
    mutex: Mutex<WalReceiversShared>,
+    pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
 }

 /// Id under which walreceiver is registered in shmem.
@@ -48,8 +52,12 @@ type WalReceiverId = usize;

 impl WalReceivers {
    pub fn new() -> Arc<WalReceivers> {
+        let (pageserver_feedback_tx, _) =
+            tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
+
        Arc::new(WalReceivers {
            mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
+            pageserver_feedback_tx,
        })
    }

@@ -116,6 +124,12 @@ impl WalReceivers {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
    }
+
+    /// Broadcast pageserver feedback to connected walproposers.
+    pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) {
+        // Err means there is no subscribers, it is fine.
+        let _ = self.pageserver_feedback_tx.send(feedback);
+    }
 }

 /// Only a few connections are expected (normally one), so store in Vec.
@@ -197,17 +211,28 @@ impl SafekeeperPostgresHandler {
        // sends, so this avoids deadlocks.
        let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
        let peer_addr = *pgb.get_peer_addr();
-        let network_reader = NetworkReader {
+        let mut network_reader = NetworkReader {
            ttid: self.ttid,
            conn_id: self.conn_id,
            pgb_reader: &mut pgb_reader,
            peer_addr,
            acceptor_handle: &mut acceptor_handle,
        };
-        let res = tokio::select! {
-            // todo: add read|write .context to these errors
-            r = network_reader.run(msg_tx, msg_rx, reply_tx) => r,
-            r = network_write(pgb, reply_rx) => r,
+
+        // Read first message and create timeline if needed.
+        let res = network_reader.read_first_message().await;
+
+        let res = if let Ok((tli, next_msg)) = res {
+            let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
+                tli.get_walreceivers().pageserver_feedback_tx.subscribe();
+
+            tokio::select! {
+                // todo: add read|write .context to these errors
+                r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
+                r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
+            }
+        } else {
+            res.map(|_| ())
        };

        // Join pg backend back.
@@ -251,12 +276,9 @@ struct NetworkReader<'a, IO> {
 }

 impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
-    async fn run(
-        self,
-        msg_tx: Sender<ProposerAcceptorMessage>,
-        msg_rx: Receiver<ProposerAcceptorMessage>,
-        reply_tx: Sender<AcceptorProposerMessage>,
-    ) -> Result<(), CopyStreamHandlerEnd> {
+    async fn read_first_message(
+        &mut self,
+    ) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
        // Receive information about server to create timeline, if not yet.
        let next_msg = read_message(self.pgb_reader).await?;
        let tli = match next_msg {
@@ -278,9 +300,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
                )))
            }
        };
+        Ok((tli, next_msg))
+    }

+    async fn run(
+        self,
+        msg_tx: Sender<ProposerAcceptorMessage>,
+        msg_rx: Receiver<ProposerAcceptorMessage>,
+        reply_tx: Sender<AcceptorProposerMessage>,
+        tli: Arc<Timeline>,
+        next_msg: ProposerAcceptorMessage,
+    ) -> Result<(), CopyStreamHandlerEnd> {
        *self.acceptor_handle = Some(WalAcceptor::spawn(
-            tli.clone(),
+            tli,
            msg_rx,
            reply_tx,
            Some(self.conn_id),
@@ -320,18 +352,46 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
 async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
    pgb_writer: &mut PostgresBackend<IO>,
    mut reply_rx: Receiver<AcceptorProposerMessage>,
+    mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
 ) -> Result<(), CopyStreamHandlerEnd> {
    let mut buf = BytesMut::with_capacity(128);

+    // storing append_response to inject PageserverFeedback into it
+    let mut last_append_response = None;
+
    loop {
-        match reply_rx.recv().await {
-            Some(msg) => {
-                buf.clear();
-                msg.serialize(&mut buf)?;
-                pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
+        // trying to read either AcceptorProposerMessage or PageserverFeedback
+        let msg = tokio::select! {
+            reply = reply_rx.recv() => {
+                if let Some(msg) = reply {
+                    if let AcceptorProposerMessage::AppendResponse(append_response) = &msg {
+                        last_append_response = Some(append_response.clone());
+                    }
+                    Some(msg)
+                } else {
+                    return Ok(()); // chan closed, WalAcceptor terminated
+                }
            }
-            None => return Ok(()), // chan closed, WalAcceptor terminated
-        }
+
+            feedback = pageserver_feedback_rx.recv() =>
+                match (feedback, &last_append_response) {
+                    (Ok(feedback), Some(append_response)) => {
+                        // clone AppendResponse and inject PageserverFeedback into it
+                        let mut append_response = append_response.clone();
+                        append_response.pageserver_feedback = Some(feedback);
+                        Some(AcceptorProposerMessage::AppendResponse(append_response))
+                    }
+                    _ => None,
+                }
+        };
+
+        let Some(msg) = msg else {
+            continue;
+        };
+
+        buf.clear();
+        msg.serialize(&mut buf)?;
+        pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
    }
 }

--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -321,7 +321,7 @@ pub struct AppendRequestHeader {
 }

 /// Report safekeeper state to proposer
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
 pub struct AppendResponse {
    // Current term of the safekeeper; if it is higher than proposer's, the
    // compute is out of date.
@@ -334,7 +334,7 @@ pub struct AppendResponse {
    // a criterion for walproposer --sync mode exit
    pub commit_lsn: Lsn,
    pub hs_feedback: HotStandbyFeedback,
-    pub pageserver_feedback: PageserverFeedback,
+    pub pageserver_feedback: Option<PageserverFeedback>,
 }

 impl AppendResponse {
@@ -344,7 +344,7 @@ impl AppendResponse {
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
        }
    }
 }
@@ -462,7 +462,11 @@ impl AcceptorProposerMessage {
                buf.put_u64_le(msg.hs_feedback.xmin);
                buf.put_u64_le(msg.hs_feedback.catalog_xmin);

-                msg.pageserver_feedback.serialize(buf);
+                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
+                // if it is not present.
+                if let Some(ref msg) = msg.pageserver_feedback {
+                    msg.serialize(buf);
+                }
            }
        }

@@ -681,7 +685,7 @@ where
            commit_lsn: self.state.commit_lsn,
            // will be filled by the upper code to avoid bothering safekeeper
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
        };
        trace!("formed AppendResponse {:?}", ar);
        ar
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,6 +2,8 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
+use crate::metrics::RECEIVED_PS_FEEDBACKS;
+use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{Term, TermLsn};
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
@@ -21,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;

-use std::cmp::{max, min};
+use std::cmp::min;
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -90,12 +92,14 @@ pub struct StandbyFeedback {
 /// WalSenders registry. Timeline holds it (wrapped in Arc).
 pub struct WalSenders {
    mutex: Mutex<WalSendersShared>,
+    walreceivers: Arc<WalReceivers>,
 }

 impl WalSenders {
-    pub fn new() -> Arc<WalSenders> {
+    pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
        Arc::new(WalSenders {
            mutex: Mutex::new(WalSendersShared::new()),
+            walreceivers,
        })
    }

@@ -151,22 +155,29 @@ impl WalSenders {
            .min()
    }

-    /// Get aggregated pageserver feedback.
-    pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
-        self.mutex.lock().agg_ps_feedback
+    /// Returns total counter of pageserver feedbacks received and last feedback.
+    pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
+        let shared = self.mutex.lock();
+        (shared.ps_feedback_counter, shared.last_ps_feedback)
    }

-    /// Get aggregated pageserver and hot standby feedback (we send them to compute).
-    pub fn get_feedbacks(self: &Arc<WalSenders>) -> (PageserverFeedback, HotStandbyFeedback) {
-        let shared = self.mutex.lock();
-        (shared.agg_ps_feedback, shared.agg_hs_feedback)
+    /// Get aggregated hot standby feedback (we send it to compute).
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
+        self.mutex.lock().agg_hs_feedback
    }

    /// Record new pageserver feedback, update aggregated values.
    fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
        let mut shared = self.mutex.lock();
        shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
-        shared.update_ps_feedback();
+        shared.last_ps_feedback = *feedback;
+        shared.ps_feedback_counter += 1;
+        drop(shared);
+
+        RECEIVED_PS_FEEDBACKS.inc();
+
+        // send feedback to connected walproposers
+        self.walreceivers.broadcast_pageserver_feedback(*feedback);
    }

    /// Record standby reply.
@@ -222,8 +233,10 @@ impl WalSenders {
 struct WalSendersShared {
    // aggregated over all walsenders value
    agg_hs_feedback: HotStandbyFeedback,
-    // aggregated over all walsenders value
-    agg_ps_feedback: PageserverFeedback,
+    // last feedback ever received from any pageserver, empty if none
+    last_ps_feedback: PageserverFeedback,
+    // total counter of pageserver feedbacks received
+    ps_feedback_counter: u64,
    slots: Vec<Option<WalSenderState>>,
 }

@@ -231,7 +244,8 @@ impl WalSendersShared {
    fn new() -> Self {
        WalSendersShared {
            agg_hs_feedback: HotStandbyFeedback::empty(),
-            agg_ps_feedback: PageserverFeedback::empty(),
+            last_ps_feedback: PageserverFeedback::empty(),
+            ps_feedback_counter: 0,
            slots: Vec::new(),
        }
    }
@@ -276,37 +290,6 @@ impl WalSendersShared {
        }
        self.agg_hs_feedback = agg;
    }
-
-    /// Update aggregated pageserver feedback. LSNs (last_received,
-    /// disk_consistent, remote_consistent) and reply timestamp are just
-    /// maximized; timeline_size if taken from feedback with highest
-    /// last_received lsn. This is generally reasonable, but we might want to
-    /// implement other policies once multiple pageservers start to be actively
-    /// used.
-    fn update_ps_feedback(&mut self) {
-        let init = PageserverFeedback::empty();
-        let acc =
-            self.slots
-                .iter()
-                .flatten()
-                .fold(init, |mut acc, ws_state| match ws_state.feedback {
-                    ReplicationFeedback::Pageserver(feedback) => {
-                        if feedback.last_received_lsn > acc.last_received_lsn {
-                            acc.current_timeline_size = feedback.current_timeline_size;
-                        }
-                        acc.last_received_lsn =
-                            max(feedback.last_received_lsn, acc.last_received_lsn);
-                        acc.disk_consistent_lsn =
-                            max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
-                        acc.remote_consistent_lsn =
-                            max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
-                        acc.replytime = max(feedback.replytime, acc.replytime);
-                        acc
-                    }
-                    ReplicationFeedback::Standby(_) => acc,
-                });
-        self.agg_ps_feedback = acc;
-    }
 }

 // Serialized is used only for pretty printing in json.
@@ -443,7 +426,7 @@ impl SafekeeperPostgresHandler {
        };
        let mut reply_reader = ReplyReader {
            reader,
-            ws_guard,
+            ws_guard: ws_guard.clone(),
            tli,
        };

@@ -452,6 +435,18 @@ impl SafekeeperPostgresHandler {
            r = sender.run() => r,
            r = reply_reader.run() => r,
        };
+
+        let ws_state = ws_guard
+            .walsenders
+            .mutex
+            .lock()
+            .get_slot(ws_guard.id)
+            .clone();
+        info!(
+            "finished streaming to {}, feedback={:?}",
+            ws_state.addr, ws_state.feedback,
+        );
+
        // Join pg backend back.
        pgb.unsplit(reply_reader.reader)?;

@@ -733,7 +728,6 @@ async fn wait_for_lsn(

 #[cfg(test)]
 mod tests {
-    use postgres_protocol::PG_EPOCH;
    use utils::id::{TenantId, TimelineId};

    use super::*;
@@ -792,27 +786,4 @@ mod tests {
        wss.update_hs_feedback();
        assert_eq!(wss.agg_hs_feedback.xmin, 42);
    }
-
-    // form pageserver feedback with given last_record_lsn / tli size and the
-    // rest set to dummy values.
-    fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback {
-        ReplicationFeedback::Pageserver(PageserverFeedback {
-            current_timeline_size,
-            last_received_lsn,
-            disk_consistent_lsn: Lsn::INVALID,
-            remote_consistent_lsn: Lsn::INVALID,
-            replytime: *PG_EPOCH,
-        })
-    }
-
-    // test that ps aggregation works as expected
-    #[test]
-    fn test_ps_feedback() {
-        let mut wss = WalSendersShared::new();
-        push_feedback(&mut wss, ps_feedback(8, Lsn(42)));
-        push_feedback(&mut wss, ps_feedback(4, Lsn(84)));
-        wss.update_ps_feedback();
-        assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4);
-        assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84));
-    }
 }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -402,6 +402,7 @@ impl Timeline {
        )));
        let (cancellation_tx, cancellation_rx) = watch::channel(false);

+        let walreceivers = WalReceivers::new();
        Ok(Timeline {
            ttid,
            wal_backup_launcher_tx,
@@ -410,8 +411,8 @@ impl Timeline {
            term_flush_lsn_watch_tx,
            term_flush_lsn_watch_rx,
            mutex: Mutex::new(shared_state),
-            walsenders: WalSenders::new(),
-            walreceivers: WalReceivers::new(),
+            walsenders: WalSenders::new(walreceivers.clone()),
+            walreceivers,
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
@@ -435,6 +436,7 @@ impl Timeline {
        let state =
            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

+        let walreceivers = WalReceivers::new();
        Ok(Timeline {
            ttid,
            wal_backup_launcher_tx,
@@ -443,8 +445,8 @@ impl Timeline {
            term_flush_lsn_watch_tx,
            term_flush_lsn_watch_rx,
            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
-            walsenders: WalSenders::new(),
-            walreceivers: WalReceivers::new(),
+            walsenders: WalSenders::new(walreceivers.clone()),
+            walreceivers,
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
@@ -656,12 +658,9 @@ impl Timeline {
            let mut shared_state = self.write_shared_state().await;
            rmsg = shared_state.sk.process_msg(msg).await?;

-            // if this is AppendResponse, fill in proper pageserver and hot
-            // standby feedback.
+            // if this is AppendResponse, fill in proper hot standby feedback.
            if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks();
-                resp.hs_feedback = hs_feedback;
-                resp.pageserver_feedback = ps_feedback;
+                resp.hs_feedback = self.walsenders.get_hotstandby();
            }

            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
@@ -898,12 +897,13 @@ impl Timeline {
            return None;
        }

-        let ps_feedback = self.walsenders.get_ps_feedback();
+        let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
        let state = self.write_shared_state().await;
        if state.active {
            Some(FullTimelineInfo {
                ttid: self.ttid,
-                ps_feedback,
+                ps_feedback_count,
+                last_ps_feedback,
                wal_backup_active: state.wal_backup_active,
                timeline_is_active: state.active,
                num_computes: self.walreceivers.get_num() as u32,
--- a/safekeeper/tests/walproposer_sim/walproposer_api.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -196,6 +196,7 @@ pub struct SimulationApi {
    safekeepers: RefCell<Vec<SafekeeperConn>>,
    disk: Arc<DiskWalProposer>,
    redo_start_lsn: Option<Lsn>,
+    last_logged_commit_lsn: u64,
    shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
    config: Config,
    event_set: RefCell<Option<EventSet>>,
@@ -228,6 +229,7 @@ impl SimulationApi {
            safekeepers: RefCell::new(sk_conns),
            disk: args.disk,
            redo_start_lsn: args.redo_start_lsn,
+            last_logged_commit_lsn: 0,
            shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
                mutex: 0,
                feedback: PageserverFeedback {
@@ -596,14 +598,11 @@ impl ApiImpl for SimulationApi {
        }
    }

-    fn process_safekeeper_feedback(
-        &self,
-        wp: &mut walproposer::bindings::WalProposer,
-        commit_lsn: u64,
-    ) {
-        debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn);
-        if commit_lsn > wp.lastSentCommitLsn {
-            self.os.log_event(format!("commit_lsn;{}", commit_lsn));
+    fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
+        debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
+        if wp.commitLsn > self.last_logged_commit_lsn {
+            self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
+            self.last_logged_commit_lsn = wp.commitLsn;
        }
    }

--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """
        DISTINCT parent_suite, suite, name
    FROM results
    WHERE
-        started_at > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '10' day
+        AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
        AND (
            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
            OR flaky
@@ -46,11 +47,14 @@ def main(args: argparse.Namespace):
        logging.error("cannot fetch flaky tests from the DB due to an error", exc)
        rows = []

-    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
+    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
    # use it to parametrize test name along with build_type and pg_version
    #
    # See test_runner/fixtures/parametrize.py for details
-    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
+        "",
+        "tokio-epoll-uring",
+    ):
        pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
    else:
        pageserver_virtual_file_io_engine_parameter = ""
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -2,6 +2,7 @@ pytest_plugins = (
    "fixtures.pg_version",
    "fixtures.parametrize",
    "fixtures.httpserver",
+    "fixtures.compute_reconfigure",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -0,0 +1,62 @@
+import concurrent.futures
+from typing import Any
+
+import pytest
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+from fixtures.log_helper import log
+from fixtures.types import TenantId
+
+
+class ComputeReconfigure:
+    def __init__(self, server):
+        self.server = server
+        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
+        self.workloads = {}
+
+    def register_workload(self, workload):
+        self.workloads[workload.tenant_id] = workload
+
+
+@pytest.fixture(scope="function")
+def compute_reconfigure_listener(make_httpserver):
+    """
+    This fixture exposes an HTTP listener for the storage controller to submit
+    compute notifications to us, instead of updating neon_local endpoints itself.
+
+    Although storage controller can use neon_local directly, this causes problems when
+    the test is also concurrently modifying endpoints.  Instead, configure storage controller
+    to send notifications up to this test code, which will route all endpoint updates
+    through Workload, which has a mutex to make concurrent updates safe.
+    """
+    server = make_httpserver
+
+    self = ComputeReconfigure(server)
+
+    # Do neon_local endpoint reconfiguration in the background so that we can
+    # accept a healthy rate of calls into notify-attach.
+    reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+
+    def handler(request: Request):
+        assert request.json is not None
+        body: dict[str, Any] = request.json
+        log.info(f"notify-attach request: {body}")
+
+        try:
+            workload = self.workloads[TenantId(body["tenant_id"])]
+        except KeyError:
+            pass
+        else:
+            # This causes the endpoint to query storage controller for its location, which
+            # is redundant since we already have it here, but this avoids extending the
+            # neon_local CLI to take full lists of locations
+            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[no-any-return]
+
+        return Response(status=200)
+
+    self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
+
+    yield self
+    reconfigure_threads.shutdown()
+    server.clear()
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -15,11 +15,11 @@ import threading
 import time
 import uuid
 from contextlib import closing, contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
-from functools import cached_property
+from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
@@ -70,6 +70,8 @@ from fixtures.remote_storage import (
    default_remote_storage,
    remote_storage_to_toml_inline_table,
 )
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.utils import are_walreceivers_absent
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
@@ -517,9 +519,9 @@ class NeonEnvBuilder:
        self.env = NeonEnv(self)
        return self.env

-    def start(self):
+    def start(self, register_pageservers=False):
        assert self.env is not None, "environment is not already initialized, call init() first"
-        self.env.start()
+        self.env.start(register_pageservers=register_pageservers)

    def init_start(
        self,
@@ -1012,24 +1014,24 @@ class NeonEnv:
        self.initial_tenant = config.initial_tenant
        self.initial_timeline = config.initial_timeline

-        # Find two adjacent ports for attachment service and its postgres DB.  This
+        # Find two adjacent ports for storage controller and its postgres DB.  This
        # loop would eventually throw from get_port() if we run out of ports (extremely
        # unlikely): usually we find two adjacent free ports on the first iteration.
        while True:
-            self.attachment_service_port = self.port_distributor.get_port()
-            attachment_service_pg_port = self.port_distributor.get_port()
-            if attachment_service_pg_port == self.attachment_service_port + 1:
+            self.storage_controller_port = self.port_distributor.get_port()
+            storage_controller_pg_port = self.port_distributor.get_port()
+            if storage_controller_pg_port == self.storage_controller_port + 1:
                break

        # The URL for the pageserver to use as its control_plane_api config
-        self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
-        # The base URL of the attachment service
-        self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
+        self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
+        # The base URL of the storage controller
+        self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"

        # For testing this with a fake HTTP server, enable passing through a URL from config
        self.control_plane_compute_hook_api = config.control_plane_compute_hook_api

-        self.attachment_service: NeonAttachmentService = NeonAttachmentService(
+        self.storage_controller: NeonStorageController = NeonStorageController(
            self, config.auth_enabled
        )

@@ -1110,17 +1112,22 @@ class NeonEnv:
        log.info(f"Config: {cfg}")
        self.neon_cli.init(cfg, force=config.config_init_force)

-    def start(self):
-        # Attachment service starts first, so that pageserver /re-attach calls don't
+    def start(self, register_pageservers=False):
+        # storage controller starts first, so that pageserver /re-attach calls don't
        # bounce through retries on startup
-        self.attachment_service.start()
+        self.storage_controller.start()

-        def attachment_service_ready():
-            assert self.attachment_service.ready() is True
+        def storage_controller_ready():
+            assert self.storage_controller.ready() is True

-        # Wait for attachment service readiness to prevent unnecessary post start-up
+        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
-        wait_until(30, 1, attachment_service_ready)
+        wait_until(30, 1, storage_controller_ready)
+
+        if register_pageservers:
+            # Special case for forward compat tests, this can be removed later.
+            for pageserver in self.pageservers:
+                self.storage_controller.node_register(pageserver)

        # Start up broker, pageserver and all safekeepers
        futs = []
@@ -1151,7 +1158,7 @@ class NeonEnv:
            if ps_assert_metric_no_errors:
                pageserver.assert_no_metric_errors()
            pageserver.stop(immediate=immediate)
-        self.attachment_service.stop(immediate=immediate)
+        self.storage_controller.stop(immediate=immediate)
        self.broker.stop(immediate=immediate)

    @property
@@ -1186,9 +1193,9 @@ class NeonEnv:
    def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
        """
        Get the NeonPageserver where this tenant shard is currently attached, according
-        to the attachment service.
+        to the storage controller.
        """
-        meta = self.attachment_service.inspect(tenant_id)
+        meta = self.storage_controller.inspect(tenant_id)
        if meta is None:
            return None
        pageserver_id = meta[1]
@@ -1695,12 +1702,12 @@ class NeonCli(AbstractNeonCli):
            res.check_returncode()
            return res

-    def attachment_service_start(self):
-        cmd = ["attachment_service", "start"]
+    def storage_controller_start(self):
+        cmd = ["storage_controller", "start"]
        return self.raw_cli(cmd)

-    def attachment_service_stop(self, immediate: bool):
-        cmd = ["attachment_service", "stop"]
+    def storage_controller_stop(self, immediate: bool):
+        cmd = ["storage_controller", "stop"]
        if immediate:
            cmd.extend(["-m", "immediate"])
        return self.raw_cli(cmd)
@@ -1710,10 +1717,8 @@ class NeonCli(AbstractNeonCli):
        id: int,
        overrides: Tuple[str, ...] = (),
        extra_env_vars: Optional[Dict[str, str]] = None,
-        register: bool = True,
    ) -> "subprocess.CompletedProcess[str]":
-        register_str = "true" if register else "false"
-        start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
+        start_args = ["pageserver", "start", f"--id={id}", *overrides]
        storage = self.env.pageserver_remote_storage
        append_pageserver_param_overrides(
            params_to_update=start_args,
@@ -1940,14 +1945,14 @@ class Pagectl(AbstractNeonCli):
        return IndexPartDump.from_json(parsed)


-class AttachmentServiceApiException(Exception):
+class StorageControllerApiException(Exception):
    def __init__(self, message, status_code: int):
        super().__init__(message)
        self.message = message
        self.status_code = status_code


-class NeonAttachmentService(MetricsGetter):
+class NeonStorageController(MetricsGetter):
    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
        self.running = False
@@ -1955,13 +1960,13 @@ class NeonAttachmentService(MetricsGetter):

    def start(self):
        assert not self.running
-        self.env.neon_cli.attachment_service_start()
+        self.env.neon_cli.storage_controller_start()
        self.running = True
        return self

-    def stop(self, immediate: bool = False) -> "NeonAttachmentService":
+    def stop(self, immediate: bool = False) -> "NeonStorageController":
        if self.running:
-            self.env.neon_cli.attachment_service_stop(immediate)
+            self.env.neon_cli.storage_controller_stop(immediate)
            self.running = False
        return self

@@ -1974,22 +1979,22 @@ class NeonAttachmentService(MetricsGetter):
                msg = res.json()["msg"]
            except:  # noqa: E722
                msg = ""
-            raise AttachmentServiceApiException(msg, res.status_code) from e
+            raise StorageControllerApiException(msg, res.status_code) from e

    def pageserver_api(self) -> PageserverHttpClient:
        """
-        The attachment service implements a subset of the pageserver REST API, for mapping
+        The storage controller implements a subset of the pageserver REST API, for mapping
        per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
        functions via the HttpClient, as an implicit check that these APIs remain compatible.
        """
        auth_token = None
        if self.auth_enabled:
            auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token)
+        return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)

    def request(self, method, *args, **kwargs) -> requests.Response:
        resp = requests.request(method, *args, **kwargs)
-        NeonAttachmentService.raise_api_exception(resp)
+        NeonStorageController.raise_api_exception(resp)

        return resp

@@ -2002,15 +2007,15 @@ class NeonAttachmentService(MetricsGetter):
        return headers

    def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
+        res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
        return parse_metrics(res.text)

    def ready(self) -> bool:
        status = None
        try:
-            resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+            resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
            status = resp.status_code
-        except AttachmentServiceApiException as e:
+        except StorageControllerApiException as e:
            status = e.status_code

        if status == 503:
@@ -2025,7 +2030,7 @@ class NeonAttachmentService(MetricsGetter):
    ) -> int:
        response = self.request(
            "POST",
-            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2036,7 +2041,7 @@ class NeonAttachmentService(MetricsGetter):
    def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
        self.request(
            "POST",
-            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2047,7 +2052,7 @@ class NeonAttachmentService(MetricsGetter):
        """
        response = self.request(
            "POST",
-            f"{self.env.attachment_service_api}/debug/v1/inspect",
+            f"{self.env.storage_controller_api}/debug/v1/inspect",
            json={"tenant_shard_id": str(tenant_shard_id)},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2064,11 +2069,13 @@ class NeonAttachmentService(MetricsGetter):
            "node_id": int(node.id),
            "listen_http_addr": "localhost",
            "listen_http_port": node.service_port.http,
+            "listen_pg_addr": "localhost",
+            "listen_pg_port": node.service_port.pg,
        }
        log.info(f"node_register({body})")
        self.request(
            "POST",
-            f"{self.env.attachment_service_api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2076,7 +2083,7 @@ class NeonAttachmentService(MetricsGetter):
    def node_list(self):
        response = self.request(
            "GET",
-            f"{self.env.attachment_service_api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
            headers=self.headers(TokenScope.ADMIN),
        )
        return response.json()
@@ -2086,7 +2093,7 @@ class NeonAttachmentService(MetricsGetter):
        body["node_id"] = node_id
        self.request(
            "PUT",
-            f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2116,7 +2123,7 @@ class NeonAttachmentService(MetricsGetter):

        response = self.request(
            "POST",
-            f"{self.env.attachment_service_api}/v1/tenant",
+            f"{self.env.storage_controller_api}/v1/tenant",
            json=body,
            headers=self.headers(TokenScope.PAGE_SERVER_API),
        )
@@ -2128,18 +2135,20 @@ class NeonAttachmentService(MetricsGetter):
        """
        response = self.request(
            "GET",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
            headers=self.headers(TokenScope.ADMIN),
        )
        body = response.json()
        shards: list[dict[str, Any]] = body["shards"]
        return shards

-    def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
+    def tenant_shard_split(
+        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
+    ) -> list[TenantShardId]:
        response = self.request(
            "PUT",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
-            json={"new_shard_count": shard_count},
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
+            json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
            headers=self.headers(TokenScope.ADMIN),
        )
        body = response.json()
@@ -2150,7 +2159,7 @@ class NeonAttachmentService(MetricsGetter):
    def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
        self.request(
            "PUT",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2163,12 +2172,29 @@ class NeonAttachmentService(MetricsGetter):
        """
        self.request(
            "POST",
-            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+            f"{self.env.storage_controller_api}/debug/v1/consistency_check",
            headers=self.headers(TokenScope.ADMIN),
        )
-        log.info("Attachment service passed consistency check")
+        log.info("storage controller passed consistency check")

-    def __enter__(self) -> "NeonAttachmentService":
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.request(
+            "PUT",
+            f"{self.env.storage_controller_api}/debug/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+
+    def __enter__(self) -> "NeonStorageController":
        return self

    def __exit__(
@@ -2231,7 +2257,6 @@ class NeonPageserver(PgProtocol):
        self,
        overrides: Tuple[str, ...] = (),
        extra_env_vars: Optional[Dict[str, str]] = None,
-        register: bool = True,
    ) -> "NeonPageserver":
        """
        Start the page server.
@@ -2241,7 +2266,7 @@ class NeonPageserver(PgProtocol):
        assert self.running is False

        self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
+            self.id, overrides=overrides, extra_env_vars=extra_env_vars
        )
        self.running = True
        return self
@@ -2399,7 +2424,7 @@ class NeonPageserver(PgProtocol):
        """
        client = self.http_client()
        if generation is None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
        return client.tenant_attach(
            tenant_id,
            config,
@@ -2408,14 +2433,14 @@ class NeonPageserver(PgProtocol):
        )

    def tenant_detach(self, tenant_id: TenantId):
-        self.env.attachment_service.attach_hook_drop(tenant_id)
+        self.env.storage_controller.attach_hook_drop(tenant_id)

        client = self.http_client()
        return client.tenant_detach(tenant_id)

    def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
        if config["mode"].startswith("Attached") and "generation" not in config:
-            config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)

        client = self.http_client()
        return client.tenant_location_conf(tenant_id, config, **kwargs)
@@ -2439,14 +2464,14 @@ class NeonPageserver(PgProtocol):
        generation: Optional[int] = None,
    ) -> TenantId:
        if generation is None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
        client = self.http_client(auth_token=auth_token)
        return client.tenant_create(tenant_id, conf, generation=generation)

    def tenant_load(self, tenant_id: TenantId):
        client = self.http_client()
        return client.tenant_load(
-            tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
        )


@@ -2547,6 +2572,20 @@ class PgBin:
        )
        return base_path

+    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
+        """
+        Run pg_controldata on given datadir and extract checkpoint lsn.
+        """
+
+        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
+        cmd = f"{pg_controldata_path} -D {pgdata}"
+        result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
+        checkpoint_lsn = re.findall(
+            "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
+        )[0]
+        log.info(f"last checkpoint at {checkpoint_lsn}")
+        return Lsn(checkpoint_lsn)
+

@pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
@@ -2843,6 +2882,7 @@ class NeonProxy(PgProtocol):
        self.auth_backend = auth_backend
        self.metric_collection_endpoint = metric_collection_endpoint
        self.metric_collection_interval = metric_collection_interval
+        self.http_timeout_seconds = 15
        self._popen: Optional[subprocess.Popen[bytes]] = None

    def start(self) -> NeonProxy:
@@ -2881,6 +2921,7 @@ class NeonProxy(PgProtocol):
            *["--proxy", f"{self.host}:{self.proxy_port}"],
            *["--mgmt", f"{self.host}:{self.mgmt_port}"],
            *["--wss", f"{self.host}:{self.external_http_port}"],
+            *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"],
            *["-c", str(crt_path)],
            *["-k", str(key_path)],
            *self.auth_backend.extra_args(),
@@ -2920,6 +2961,9 @@ class NeonProxy(PgProtocol):
        user = quote(kwargs["user"])
        password = quote(kwargs["password"])
        expected_code = kwargs.get("expected_code")
+        timeout = kwargs.get("timeout")
+
+        log.info(f"Executing http query: {query}")

        connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
        response = requests.post(
@@ -2931,6 +2975,7 @@ class NeonProxy(PgProtocol):
                "Neon-Pool-Opt-In": "true",
            },
            verify=str(self.test_output_dir / "proxy.crt"),
+            timeout=timeout,
        )

        if expected_code is not None:
@@ -2943,6 +2988,8 @@ class NeonProxy(PgProtocol):
        password = kwargs["password"]
        expected_code = kwargs.get("expected_code")

+        log.info(f"Executing http2 query: {query}")
+
        connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
        async with httpx.AsyncClient(
            http2=True, verify=str(self.test_output_dir / "proxy.crt")
@@ -3565,220 +3612,6 @@ class Safekeeper:
        return segments


-# Walreceiver as returned by sk's timeline status endpoint.
-@dataclass
-class Walreceiver:
-    conn_id: int
-    state: str
-
-
-@dataclass
-class SafekeeperTimelineStatus:
-    acceptor_epoch: int
-    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
-    flush_lsn: Lsn
-    commit_lsn: Lsn
-    timeline_start_lsn: Lsn
-    backup_lsn: Lsn
-    peer_horizon_lsn: Lsn
-    remote_consistent_lsn: Lsn
-    walreceivers: List[Walreceiver]
-
-
-@dataclass
-class SafekeeperMetrics:
-    # These are metrics from Prometheus which uses float64 internally.
-    # As a consequence, values may differ from real original int64s.
-    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-
-
-class SafekeeperHttpClient(requests.Session):
-    HTTPError = requests.HTTPError
-
-    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
-        super().__init__()
-        self.port = port
-        self.auth_token = auth_token
-        self.is_testing_enabled = is_testing_enabled
-
-        if auth_token is not None:
-            self.headers["Authorization"] = f"Bearer {auth_token}"
-
-    def check_status(self):
-        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
-
-    def is_testing_enabled_or_skip(self):
-        if not self.is_testing_enabled:
-            pytest.skip("safekeeper was built without 'testing' feature")
-
-    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
-        self.is_testing_enabled_or_skip()
-
-        if isinstance(config_strings, tuple):
-            pairs = [config_strings]
-        else:
-            pairs = config_strings
-
-        log.info(f"Requesting config failpoints: {repr(pairs)}")
-
-        res = self.put(
-            f"http://localhost:{self.port}/v1/failpoints",
-            json=[{"name": name, "actions": actions} for name, actions in pairs],
-        )
-        log.info(f"Got failpoints request response code {res.status_code}")
-        res.raise_for_status()
-        res_json = res.json()
-        assert res_json is None
-        return res_json
-
-    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
-        params = params or {}
-        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
-        res.raise_for_status()
-        res_json = json.loads(res.text)
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def patch_control_file(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        patch: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        res = self.patch(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
-            json={
-                "updates": patch,
-                "apply_fields": list(patch.keys()),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
-        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
-            json=body,
-        )
-        res.raise_for_status()
-
-    def timeline_digest(
-        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
-    ) -> Dict[str, Any]:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
-            params={
-                "from_lsn": str(from_lsn),
-                "until_lsn": str(until_lsn),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_create(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
-        commit_lsn: Lsn,
-    ):
-        body = {
-            "tenant_id": str(tenant_id),
-            "timeline_id": str(timeline_id),
-            "pg_version": pg_version,
-            "commit_lsn": str(commit_lsn),
-        }
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
-        res.raise_for_status()
-
-    def timeline_status(
-        self, tenant_id: TenantId, timeline_id: TimelineId
-    ) -> SafekeeperTimelineStatus:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
-        res.raise_for_status()
-        resj = res.json()
-        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
-        return SafekeeperTimelineStatus(
-            acceptor_epoch=resj["acceptor_state"]["epoch"],
-            pg_version=resj["pg_info"]["pg_version"],
-            flush_lsn=Lsn(resj["flush_lsn"]),
-            commit_lsn=Lsn(resj["commit_lsn"]),
-            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
-            backup_lsn=Lsn(resj["backup_lsn"]),
-            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
-            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
-            walreceivers=walreceivers,
-        )
-
-    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
-            json=body,
-        )
-        res.raise_for_status()
-
-    # only_local doesn't remove segments in the remote storage.
-    def timeline_delete(
-        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
-    ) -> Dict[Any, Any]:
-        res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
-            params={
-                "only_local": str(only_local).lower(),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
-        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def get_metrics_str(self) -> str:
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
-        request_result.raise_for_status()
-        return request_result.text
-
-    def get_metrics(self) -> SafekeeperMetrics:
-        all_metrics_text = self.get_metrics_str()
-
-        metrics = SafekeeperMetrics()
-        for match in re.finditer(
-            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
-                match.group(3)
-            )
-        for match in re.finditer(
-            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.commit_lsn_inexact[
-                (TenantId(match.group(1)), TimelineId(match.group(2)))
-            ] = int(match.group(3))
-        return metrics
-
-
 class S3Scrubber:
    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
        self.env = env
@@ -4088,32 +3921,29 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:

 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
+
    # Get the timeline ID. We need it for the 'basebackup' command
    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

-    # many tests already checkpoint, but do it just in case
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CHECKPOINT")
-
-    # wait for pageserver to catch up
-    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

+    # Read the shutdown checkpoint's LSN
+    checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())
+
    # Take a basebackup from pageserver
    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
    restored_dir_path.mkdir(exist_ok=True)

-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
    psql_path = os.path.join(pg_bin.pg_bin_path, "psql")

-    pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
+    pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"]
    cmd = rf"""
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
         | tar -x -C {restored_dir_path}
    """

@@ -4195,7 +4025,7 @@ def tenant_get_shards(
    us to figure out the shards for a tenant.

    If the caller provides `pageserver_id`, it will be used for all shards, even
-    if the shard is indicated by attachment service to be on some other pageserver.
+    if the shard is indicated by storage controller to be on some other pageserver.

    Caller should over the response to apply their per-pageserver action to
    each shard
@@ -4211,7 +4041,7 @@ def tenant_get_shards(
                TenantShardId.parse(s["shard_id"]),
                override_pageserver or env.get_pageserver(s["node_id"]),
            )
-            for s in env.attachment_service.locate(tenant_id)
+            for s in env.storage_controller.locate(tenant_id)
        ]
    else:
        # Assume an unsharded tenant
@@ -4262,6 +4092,49 @@ def wait_for_last_flush_lsn(
    return min(results)


+def flush_ep_to_pageserver(
+    env: NeonEnv,
+    ep: Endpoint,
+    tenant: TenantId,
+    timeline: TimelineId,
+    pageserver_id: Optional[int] = None,
+) -> Lsn:
+    """
+    Stop endpoint and wait until all committed WAL reaches the pageserver
+    (last_record_lsn). This is for use by tests which want everything written so
+    far to reach pageserver *and* expecting that no more data will arrive until
+    endpoint starts again, so unlike wait_for_last_flush_lsn it polls
+    safekeepers instead of compute to learn LSN.
+
+    Returns the catch up LSN.
+    """
+    ep.stop()
+
+    commit_lsn: Lsn = Lsn(0)
+    # In principle in the absense of failures polling single sk would be enough.
+    for sk in env.safekeepers:
+        cli = sk.http_client()
+        # wait until compute connections are gone
+        wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
+        commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)
+
+    # Note: depending on WAL filtering implementation, probably most shards
+    # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this
+    # is broken in sharded case.
+    shards = tenant_get_shards(env, tenant, pageserver_id)
+    for tenant_shard_id, pageserver in shards:
+        log.info(
+            f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
+        )
+        waited = wait_for_last_record_lsn(
+            pageserver.http_client(), tenant_shard_id, timeline, commit_lsn
+        )
+
+        assert waited >= commit_lsn
+
+    return commit_lsn
+
+
 def wait_for_wal_insert_lsn(
    env: NeonEnv,
    endpoint: Endpoint,
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -318,6 +318,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        assert isinstance(res_json["tenant_shards"], list)
        return res_json

+    def tenant_get_location(self, tenant_id: TenantShardId):
+        res = self.get(
+            f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
+        )
+        self.verbose_error(res)
+        return res.json()
+
    def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -43,7 +43,7 @@ def single_timeline(
    log.info("detach template tenant form pageserver")
    env.pageserver.tenant_detach(template_tenant)
    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
        ".*Dropped remote consistent LSN updates.*",
    )

--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -46,9 +46,12 @@ def pytest_generate_tests(metafunc: Metafunc):

    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))

-    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
-    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
-    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs`
+    # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
+        "",
+        "tokio-epoll-uring",
+    ):
        metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])

    # For performance tests, parametrize also by platform
--- a/test_runner/fixtures/safekeeper/init.py
+++ b/test_runner/fixtures/safekeeper/init.py
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -0,0 +1,227 @@
+import json
+import re
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytest
+import requests
+
+from fixtures.log_helper import log
+from fixtures.types import Lsn, TenantId, TimelineId
+
+
+# Walreceiver as returned by sk's timeline status endpoint.
+@dataclass
+class Walreceiver:
+    conn_id: int
+    state: str
+
+
+@dataclass
+class SafekeeperTimelineStatus:
+    acceptor_epoch: int
+    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
+    flush_lsn: Lsn
+    commit_lsn: Lsn
+    timeline_start_lsn: Lsn
+    backup_lsn: Lsn
+    peer_horizon_lsn: Lsn
+    remote_consistent_lsn: Lsn
+    walreceivers: List[Walreceiver]
+
+
+@dataclass
+class SafekeeperMetrics:
+    # These are metrics from Prometheus which uses float64 internally.
+    # As a consequence, values may differ from real original int64s.
+    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+
+
+class SafekeeperHttpClient(requests.Session):
+    HTTPError = requests.HTTPError
+
+    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
+        super().__init__()
+        self.port = port
+        self.auth_token = auth_token
+        self.is_testing_enabled = is_testing_enabled
+
+        if auth_token is not None:
+            self.headers["Authorization"] = f"Bearer {auth_token}"
+
+    def check_status(self):
+        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
+
+    def is_testing_enabled_or_skip(self):
+        if not self.is_testing_enabled:
+            pytest.skip("safekeeper was built without 'testing' feature")
+
+    def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]):
+        self.is_testing_enabled_or_skip()
+
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert res_json is None
+        return res_json
+
+    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        params = params or {}
+        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
+        res.raise_for_status()
+        res_json = json.loads(res.text)
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def patch_control_file(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        patch: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
+            json={
+                "updates": patch,
+                "apply_fields": list(patch.keys()),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
+            json=body,
+        )
+        res.raise_for_status()
+
+    def timeline_digest(
+        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
+    ) -> Dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
+            params={
+                "from_lsn": str(from_lsn),
+                "until_lsn": str(until_lsn),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_create(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
+        commit_lsn: Lsn,
+    ):
+        body = {
+            "tenant_id": str(tenant_id),
+            "timeline_id": str(timeline_id),
+            "pg_version": pg_version,
+            "commit_lsn": str(commit_lsn),
+        }
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
+        res.raise_for_status()
+
+    def timeline_status(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> SafekeeperTimelineStatus:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
+        res.raise_for_status()
+        resj = res.json()
+        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
+        return SafekeeperTimelineStatus(
+            acceptor_epoch=resj["acceptor_state"]["epoch"],
+            pg_version=resj["pg_info"]["pg_version"],
+            flush_lsn=Lsn(resj["flush_lsn"]),
+            commit_lsn=Lsn(resj["commit_lsn"]),
+            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
+            backup_lsn=Lsn(resj["backup_lsn"]),
+            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
+            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
+            walreceivers=walreceivers,
+        )
+
+    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        return self.timeline_status(tenant_id, timeline_id).commit_lsn
+
+    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
+            json=body,
+        )
+        res.raise_for_status()
+
+    # only_local doesn't remove segments in the remote storage.
+    def timeline_delete(
+        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
+    ) -> Dict[Any, Any]:
+        res = self.delete(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            params={
+                "only_local": str(only_local).lower(),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
+        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def get_metrics_str(self) -> str:
+        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result.raise_for_status()
+        return request_result.text
+
+    def get_metrics(self) -> SafekeeperMetrics:
+        all_metrics_text = self.get_metrics_str()
+
+        metrics = SafekeeperMetrics()
+        for match in re.finditer(
+            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
+            all_metrics_text,
+            re.MULTILINE,
+        ):
+            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
+                match.group(3)
+            )
+        for match in re.finditer(
+            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
+            all_metrics_text,
+            re.MULTILINE,
+        ):
+            metrics.commit_lsn_inexact[
+                (TenantId(match.group(1)), TimelineId(match.group(2)))
+            ] = int(match.group(3))
+        return metrics
--- a/test_runner/fixtures/safekeeper/utils.py
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -0,0 +1,11 @@
+from fixtures.log_helper import log
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.types import TenantId, TimelineId
+
+
+def are_walreceivers_absent(
+    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
+):
+    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
+    return len(status.walreceivers) == 0
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -1,3 +1,4 @@
+import threading
 from typing import Optional

 from fixtures.log_helper import log
@@ -11,6 +12,10 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.types import TenantId, TimelineId

+# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
+# to ensure we don't do that: this enables running lots of Workloads in parallel safely.
+ENDPOINT_LOCK = threading.Lock()
+

 class Workload:
    """
@@ -41,17 +46,30 @@ class Workload:

        self._endpoint: Optional[Endpoint] = None

+    def reconfigure(self):
+        """
+        Request the endpoint to reconfigure based on location reported by storage controller
+        """
+        if self._endpoint is not None:
+            with ENDPOINT_LOCK:
+                self._endpoint.reconfigure()
+
    def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
-        if self._endpoint is None:
-            self._endpoint = self.env.endpoints.create(
-                self.branch_name,
-                tenant_id=self.tenant_id,
-                pageserver_id=pageserver_id,
-                endpoint_id="ep-workload",
-            )
-            self._endpoint.start(pageserver_id=pageserver_id)
-        else:
-            self._endpoint.reconfigure(pageserver_id=pageserver_id)
+        # We may be running alongside other Workloads for different tenants.  Full TTID is
+        # obnoxiously long for use here, but a cut-down version is still unique enough for tests.
+        endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
+
+        with ENDPOINT_LOCK:
+            if self._endpoint is None:
+                self._endpoint = self.env.endpoints.create(
+                    self.branch_name,
+                    tenant_id=self.tenant_id,
+                    pageserver_id=pageserver_id,
+                    endpoint_id=endpoint_id,
+                )
+                self._endpoint.start(pageserver_id=pageserver_id)
+            else:
+                self._endpoint.reconfigure(pageserver_id=pageserver_id)

        connstring = self._endpoint.safe_psql(
            "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -94,7 +112,7 @@ class Workload:
        else:
            return False

-    def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
+    def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True):
        assert self.expect_rows >= n

        max_iters = 10
@@ -132,22 +150,28 @@ class Workload:
                ]
            )

-        for tenant_shard_id, pageserver in tenant_get_shards(
-            self.env, self.tenant_id, pageserver_id
-        ):
-            last_flush_lsn = wait_for_last_flush_lsn(
-                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
-            )
-            ps_http = pageserver.http_client()
-            wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+        if ingest:
+            # Wait for written data to be ingested by the pageserver
+            for tenant_shard_id, pageserver in tenant_get_shards(
+                self.env, self.tenant_id, pageserver_id
+            ):
+                last_flush_lsn = wait_for_last_flush_lsn(
+                    self.env,
+                    endpoint,
+                    self.tenant_id,
+                    self.timeline_id,
+                    pageserver_id=pageserver_id,
+                )
+                ps_http = pageserver.http_client()
+                wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)

-            if upload:
-                # force a checkpoint to trigger upload
-                ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
-                wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
-                log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
-            else:
-                log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
+                if upload:
+                    # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
+                    ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
+                    wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+                    log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
+                else:
+                    log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")

    def validate(self, pageserver_id: Optional[int] = None):
        endpoint = self.endpoint(pageserver_id)
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -56,7 +56,7 @@ def setup_env(
        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
        env.pageserver.tenant_detach(template_tenant)
        env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
            ".*Dropped remote consistent LSN updates.*",
        )
        env.pageserver.tenant_attach(template_tenant, config)
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -92,7 +92,7 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
    env.pageserver.tenant_detach(template_tenant)
    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
        ".*Dropped remote consistent LSN updates.*",
    )
    env.pageserver.tenant_attach(template_tenant, config)
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -114,7 +114,7 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
    env.pageserver.tenant_detach(template_tenant)
    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
        ".*Dropped remote consistent LSN updates.*",
    )
    env.pageserver.tenant_attach(template_tenant, config)
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -56,12 +56,12 @@ def measure_recovery_time(env: NeonCompare):
    # Delete the Tenant in the pageserver: this will drop local and remote layers, such that
    # when we "create" the Tenant again, we will replay the WAL from the beginning.
    #
-    # This is a "weird" thing to do, and can confuse the attachment service as we're re-using
+    # This is a "weird" thing to do, and can confuse the storage controller as we're re-using
    # the same tenant ID for a tenant that is logically different from the pageserver's point
    # of view, but the same as far as the safekeeper/WAL is concerned.  To work around that,
    # we will explicitly create the tenant in the same generation that it was previously
    # attached in.
-    attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
+    attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant)
    assert attach_status is not None
    (attach_gen, _) = attach_status

--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -137,7 +137,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
    ps_http.tenant_detach(tenant_id)
    assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]

-    body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
+    body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)}

    ps_http.post(
        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
    # is more like what happens in an unexpected  pageserver failure.
    #
-    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
    # about to start as the attached pageserver
-    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
    env.pageservers[0].start()
    env.pageservers[1].stop()

@@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    assert fetchone() == (100000,)

    env.pageservers[0].stop()
-    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
    # about to start as the attached pageserver
-    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
    env.pageservers[1].start()

    # Test a (former) bug where a child process spins without updating its connection string
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -133,7 +133,7 @@ def test_create_snapshot(
    for sk in env.safekeepers:
        sk.stop()
    env.pageserver.stop()
-    env.attachment_service.stop()
+    env.storage_controller.stop()

    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
    compatibility_snapshot_dir = (
@@ -242,7 +242,7 @@ def test_forward_compatibility(
        # everything else: our test code is written for latest CLI args.
        env.neon_local_binpath = neon_local_binpath

-        neon_env_builder.start()
+        neon_env_builder.start(register_pageservers=True)

        check_neon_works(
            env,
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -4,12 +4,11 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
+    flush_ep_to_pageserver,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
-from fixtures.utils import query_scalar


 # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
@@ -46,14 +45,15 @@ def test_basic_eviction(
            FROM generate_series(1, 5000000) g
            """
        )
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

-    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # stops the endpoint
+    current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
+
    client.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(client, tenant_id, timeline_id, current_lsn)

-    # disable compute & sks to avoid on-demand downloads by walreceiver / getpage
-    endpoint.stop()
+    # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
+    # has already been stopped by flush_ep_to_pageserver
    for sk in env.safekeepers:
        sk.stop()

--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -1,7 +1,7 @@
 import time

 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 from fixtures.pageserver.types import (
    DeltaLayerFileName,
    ImageLayerFileName,
@@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
                    )
                    == 0
                )
-
-    endpoint.stop()
+    last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)

@@ -160,7 +159,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
    time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites

    def get_generation_number():
-        attachment = env.attachment_service.inspect(tenant_id)
+        attachment = env.storage_controller.inspect(tenant_id)
        assert attachment is not None
        return attachment[0]

--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
    # Stop default ps/sk
    env.neon_cli.pageserver_stop(env.pageserver.id)
    env.neon_cli.safekeeper_stop()
-    env.neon_cli.attachment_service_stop(False)
+    env.neon_cli.storage_controller_stop(False)

    # Keep NeonEnv state up to date, it usually owns starting/stopping services
    env.pageserver.running = False
@@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)

    # Stop this to get out of the way of the following `start`
-    env.neon_cli.attachment_service_stop(False)
+    env.neon_cli.storage_controller_stop(False)

    # Default start
    res = env.neon_cli.raw_cli(["start"])
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
+    flush_ep_to_pageserver,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
@@ -517,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:

        with endpoint.cursor() as cur:
            cur.execute("update a set id = -id")
-        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -73,7 +73,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
    # create new tenant and check it is also there
    tenant_id = TenantId.generate()
    client.tenant_create(
-        tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+        tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
    )
    assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}

--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -203,7 +203,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
    env.broker.try_start()
    for sk in env.safekeepers:
        sk.start()
-    env.attachment_service.start()
+    env.storage_controller.start()
+
+    # We will start a pageserver with no control_plane_api set, so it won't be able to self-register
+    env.storage_controller.node_register(env.pageserver)

    env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))

@@ -285,7 +288,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)

-    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
    main_pageserver = env.get_pageserver(attached_to_id)
    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]

@@ -310,7 +313,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):

    # Now advance the generation in the control plane: subsequent validations
    # from the running pageserver will fail.  No more deletions should happen.
-    env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
    generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)

    assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -366,7 +369,7 @@ def test_deletion_queue_recovery(
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)

-    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
    main_pageserver = env.get_pageserver(attached_to_id)
    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]

@@ -428,7 +431,7 @@ def test_deletion_queue_recovery(

    if keep_attachment == KeepAttachment.LOSE:
        some_other_pageserver = other_pageserver.id
-        env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
+        env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver)

    main_pageserver.start()

@@ -494,7 +497,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    )

    # Simulate a major incident: the control plane goes offline
-    env.attachment_service.stop()
+    env.storage_controller.stop()

    # Remember how many validations had happened before the control plane went offline
    validated = get_deletion_queue_validated(ps_http)
@@ -511,7 +514,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
    env.pageserver.start(
        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
-        register=False,
    )

    # The pageserver should provide service to clients
@@ -525,7 +527,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    assert get_deletion_queue_executed(ps_http) == 0

    # When the control plane comes back up, normal service should resume
-    env.attachment_service.start()
+    env.storage_controller.start()

    ps_http.deletion_queue_flush(execute=True)
    assert get_deletion_queue_depth(ps_http) == 0
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -157,7 +157,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
                workload.churn_rows(rng.randint(128, 256), pageserver.id)
                workload.validate(pageserver.id)
            elif last_state_ps[0].startswith("Attached"):
-                # The `attachment_service` will only re-attach on startup when a pageserver was the
+                # The `storage_controller` will only re-attach on startup when a pageserver was the
                # holder of the latest generation: otherwise the pageserver will revert to detached
                # state if it was running attached with a stale generation
                last_state[pageserver.id] = ("Detached", None)
@@ -182,12 +182,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
                        generation = last_state_ps[1]
                    else:
                        # Switch generations, while also jumping between attached states
-                        generation = env.attachment_service.attach_hook_issue(
+                        generation = env.storage_controller.attach_hook_issue(
                            tenant_id, pageserver.id
                        )
                        latest_attached = pageserver.id
                else:
-                    generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id)
+                    generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id)
                    latest_attached = pageserver.id
            else:
                generation = None
@@ -273,7 +273,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
    # Encourage the new location to download while still in secondary mode
    pageserver_b.http_client().tenant_secondary_download(tenant_id)

-    migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
+    migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id)
    log.info(f"Acquired generation {migrated_generation} for destination pageserver")
    assert migrated_generation == initial_generation + 1

@@ -436,7 +436,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
+    assert env.storage_controller is not None
    assert isinstance(env.pageserver_remote_storage, S3Storage)  # Satisfy linter

    tenant_id = env.initial_tenant
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -564,3 +564,71 @@ async def test_sql_over_http2(static_proxy: NeonProxy):
        "select 42 as answer", [], user="http", password="http", expected_code=200
    )
    assert resp["rows"] == [{"answer": 42}]
+
+
+def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    # expect to fail with timeout
+    res = static_proxy.http_query(
+        query,
+        [static_proxy.http_timeout_seconds + 1, 1],
+        user="http",
+        password="http",
+        expected_code=400,
+    )
+    assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out"
+
+    time.sleep(2)
+
+    res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
+    assert res["command"] == "INSERT", "HTTP query should insert"
+    assert res["rowCount"] == 1, "HTTP query should insert"
+
+    res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
+    assert (
+        "duplicate key value violates unique constraint" in res["message"]
+    ), "HTTP query should conflict"
+
+
+def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    try:
+        # The request should complete before the proxy HTTP timeout triggers.
+        # Timeout and cancel the request on the client side before the query completes.
+        static_proxy.http_query(
+            query,
+            [static_proxy.http_timeout_seconds - 1, 1],
+            user="http",
+            password="http",
+            timeout=2,
+        )
+    except requests.exceptions.ReadTimeout:
+        pass
+
+    # wait until the query _would_ have been complete
+    time.sleep(static_proxy.http_timeout_seconds)
+
+    res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
+    assert res["command"] == "INSERT", "HTTP query should insert"
+    assert res["rowCount"] == 1, "HTTP query should insert"
+
+    res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
+    assert (
+        "duplicate key value violates unique constraint" in res["message"]
+    ), "HTTP query should conflict"
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -169,7 +169,7 @@ def test_remote_storage_backup_and_restore(
    # Ensure that even though the tenant is broken, retrying the attachment fails
    with pytest.raises(Exception, match="Tenant state is Broken"):
        # Use same generation as in previous attempt
-        gen_state = env.attachment_service.inspect(tenant_id)
+        gen_state = env.storage_controller.inspect(tenant_id)
        assert gen_state is not None
        generation = gen_state[0]
        env.pageserver.tenant_attach(tenant_id, generation=generation)
@@ -355,7 +355,7 @@ def test_remote_storage_upload_queue_retries(
    env.pageserver.stop(immediate=True)
    env.endpoints.stop_all()

-    # We are about to forcibly drop local dirs.  Attachment service will increment generation in re-attach before
+    # We are about to forcibly drop local dirs.  Storage controller will increment generation in re-attach before
    # we later increment when actually attaching it again, leading to skipping a generation and potentially getting
    # these warnings if there was a durable but un-executed deletion list at time of restart.
    env.pageserver.allowed_errors.extend(
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -80,7 +80,7 @@ def test_tenant_s3_restore(
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
    ), "tenant removed before we deletion was issued"
-    env.attachment_service.attach_hook_drop(tenant_id)
+    env.storage_controller.attach_hook_drop(tenant_id)

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()
@@ -103,7 +103,7 @@ def test_tenant_s3_restore(
        tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
    )

-    generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+    generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)

    ps_http.tenant_attach(tenant_id, generation=generation)
    env.pageserver.quiesce_tenants()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Anna Khanova	27bc242085	Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14 Proxy release 2024-03-14	2024-03-14 14:57:05 +05:00
Anna Khanova	192b49cc6d	Merge branch 'release-proxy' into rc/proxy/2024-03-14	2024-03-14 14:16:36 +05:00
John Spray	44f42627dd	pageserver/controller: error handling for shard splitting (#7074 ) ## Problem Shard splits worked, but weren't safe against failures (e.g. node crash during split) yet. Related: #6676 ## Summary of changes - Introduce async rwlocks at the scope of Tenant and Node: - exclusive tenant lock is used to protect splits - exclusive node lock is used to protect new reconciliation process that happens when setting node active - exclusive locks used in both cases when doing persistent updates (e.g. node scheduling conf) where the update to DB & in-memory state needs to be atomic. - Add failpoints to shard splitting in control plane and pageserver code. - Implement error handling in control plane for shard splits: this detaches child chards and ensures parent shards are re-attached. - Crash-safety for storage controller restarts requires little effort: we already reconcile with nodes over a storage controller restart, so as long as we reset any incomplete splits in the DB on restart (added in this PR), things are implicitly cleaned up. - Implement reconciliation with offline nodes before they transition to active: - (in this context reconciliation means something like startup_reconcile, not literally the Reconciler) - This covers cases where split abort cannot reach a node to clean it up: the cleanup will eventually happen when the node is marked active, as part of reconciliation. - This also covers the case where a node was unavailable when the storage controller started, but becomes available later: previously this allowed it to skip the startup reconcile. - Storage controller now terminates on panics. We only use panics for true "should never happen" assertions, and these cases can leave us in an un-usable state if we keep running (e.g. panicking in a shard split). In the unlikely event that we get into a crashloop as a result, we'll rely on kubernetes to back us off. - Add `test_sharding_split_failures` which exercises a variety of failure cases during shard split.	2024-03-14 09:11:57 +00:00
Conrad Ludgate	3bd6551b36	proxy http cancellation safety (#7117 ) ## Problem hyper auto-cancels the request futures on connection close. `sql_over_http::handle` is not 'drop cancel safe', so we need to do some other work to make sure connections are queries in the right way. ## Summary of changes 1. tokio::spawn the request handler to resolve the initial cancel-safety issue 2. share a cancellation token, and cancel it when the request `Service` is dropped. 3. Add a new log span to be able to track the HTTP connection lifecycle.	2024-03-14 08:20:56 +00:00
Christian Schwarz	69338e53e3	throttling: fixup interactions with Timeline::get_vectored (#7089 ) ## Problem Before this PR, `Timeline::get_vectored` would be throttled twice if the sequential option was enabled or if validation was enabled. Also, `pageserver_get_vectored_seconds` included the time spent in the throttle, which turns out to be undesirable for what we use that metric for. ## Summary of changes Double-throttle: * Add `Timeline::get0` method which is unthrottled. * Use that method from within the `Timeline::get_vectored` code path. Metric: * return throttled time from `throttle()` method * deduct the value from the observed time * globally rate-limited logging of duration subtraction errors, like in all other places that do the throttled-time deduction from observations	2024-03-13 17:49:17 +00:00
Arpad Müller	5309711691	Make tenant_id in TenantLocationConfigRequest optional (#7055 ) The `tenant_id` in `TenantLocationConfigRequest` in the `location_config` endpoint was only used in the storage controller/attachment service, and there it was only used for assertions and the creation part.	2024-03-13 17:30:29 +01:00
Joonas Koivunen	8a53d576e6	fix(metrics): time individual layer flush operations (#7109 ) Currently, the flushing operation could flush multiple frozen layers to the disk and store the aggregate time in the histogram. The result is a bimodal distribution with short and over 1000-second flushes. Change it so that we record how long one layer flush takes.	2024-03-13 15:10:20 +00:00
Anna Khanova	b0aff04157	proxy: add new dimension to exclude cplane latency (#7011 ) ## Problem Currently cplane communication is a part of the latency monitoring. It doesn't allow to setup the proper alerting based on proxy latency. ## Summary of changes Added dimension to exclude cplane latency.	2024-03-13 13:50:05 +01:00
Anna Khanova	0554bee022	proxy: Report warm cold start if connection is from the local cache (#7104 ) ## Problem * quotes in serialized string * no status if connection is from local cache ## Summary of changes * remove quotes * report warm if connection if from local cache	2024-03-13 11:45:19 +00:00
Conrad Ludgate	83855a907c	proxy http error classification (#7098 ) ## Problem Missing error classification for SQL-over-HTTP queries. Not respecting `UserFacingError` for SQL-over-HTTP queries. ## Summary of changes Adds error classification. Adds user facing errors.	2024-03-13 07:35:49 +01:00
John Spray	1b41db8bdd	pageserver: enable setting stripe size inline with split request. (#7093 ) ## Summary - Currently we can set stripe size at tenant creation, but it doesn't mean anything until we have multiple shards - When onboarding an existing tenant, it will always get a default shard stripe size, so we would like to be able to pick the actual stripe size at the point we split. ## Why do this inline with a split? The alternative to this change would be to have a separate endpoint on the storage controller for setting the stripe size on a tenant, and only permit writes to that endpoint when the tenant has only a single shard. That would work, but be a little bit more work for a client, and not appreciably simpler (instead of having a special argument to the split functions, we'd have a special separate endpoint, and a requirement that the controller must sync its config down to the pageserver before calling the split API). Either approach would work, but this one feels a bit more robust end-to-end: the split API is the _very last moment_ that the stripe size is mutable, so if we aim to set it before splitting, it makes sense to do it as part of the same operation.	2024-03-12 20:41:08 +00:00
Jure Bajic	bac06ea1ac	pageserver: fix read path max lsn bug (#7007 ) ## Summary of changes The problem it fixes is when `request_lsn` is `u64::MAX-1` the `cont_lsn` becomes `u64::MAX` which is the same as `prev_lsn` which stops the loop. Closes https://github.com/neondatabase/neon/issues/6812	2024-03-12 16:32:47 +00:00
John Spray	7ae8364b0b	storage controller: register nodes in re-attach request (#7040 ) ## Problem Currently we manually register nodes with the storage controller, and use a script during deploy to register with the cloud control plane. Rather than extend that script further, nodes should just register on startup. ## Summary of changes - Extend the re-attach request to include an optional NodeRegisterRequest - If the `register` field is set, handle it like a normal node registration before executing the normal re-attach work. - Update tests/neon_local that used to rely on doing an explicit register step that could be enabled/disabled. --------- Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-03-12 14:47:12 +00:00
Conrad Ludgate	1f7d54f987	proxy refactor tls listener (#7056 ) ## Problem Now that we have tls-listener vendored, we can refactor and remove a lot of bloated code and make the whole flow a bit simpler ## Summary of changes 1. Remove dead code 2. Move the error handling to inside the `TlsListener` accept() function 3. Extract the peer_addr from the PROXY protocol header and log it with errors	2024-03-12 13:05:40 +00:00
Arthur Petukhovsky	580e136b2e	Forward all backpressure feedback to compute (#7079 ) Previously we aggregated ps_feedback on each safekeeper and sent it to walproposer with every AppendResponse. This PR changes it to send ps_feedback to walproposer right after receiving it from pageserver, without aggregating it in memory. Also contains some preparations for implementing backpressure support for sharding.	2024-03-12 12:14:02 +00:00
Conrad Ludgate	09699d4bd8	proxy: cancel http queries on timeout (#7031 ) ## Problem On HTTP query timeout, we should try and cancel the current in-flight SQL query. ## Summary of changes Trigger a cancellation command in postgres once the timeout is reach	2024-03-12 11:52:00 +00:00
John Spray	89cf714890	tests/neon_local: rename "attachment service" -> "storage controller" (#7087 ) Not a user-facing change, but can break any existing `.neon` directories created by neon_local, as the name of the database used by the storage controller changes. This PR changes all the locations apart from the path of `control_plane/attachment_service` (waiting for an opportune moment to do that one, because it's the most conflict-ish wrt ongoing PRs like #6676 )	2024-03-12 11:36:27 +00:00
Heikki Linnakangas	621ea2ec44	tests: try to make restored-datadir comparison tests not flaky v2 This test occasionally fails with a difference in "pg_xact/0000" file between the local and restored datadirs. My hypothesis is that something changed in the database between the last explicit checkpoint and the shutdown. I suspect autovacuum, it could certainly create transactions. To fix, be more precise about the point in time that we compare. Shut down the endpoint first, then read the last LSN (i.e. the shutdown checkpoint's LSN), from the local disk with pg_controldata. And use exactly that LSN in the basebackup. Closes #559	2024-03-11 23:29:32 +04:00
Heikki Linnakangas	74d09b78c7	Keep walproposer alive until shutdown checkpoint is safe on safekepeers The walproposer pretends to be a walsender in many ways. It has a WalSnd slot, it claims to be a walsender by calling MarkPostmasterChildWalSender() etc. But one different to real walsenders was that the postmaster still treated it as a bgworker rather than a walsender. The difference is that at shutdown, walsenders are not killed until the very end, after the checkpointer process has written the shutdown checkpoint and exited. As a result, the walproposer always got killed before the shutdown checkpoint was written, so the shutdown checkpoint never made it to safekeepers. That's fine in principle, we don't require a clean shutdown after all. But it also feels a bit silly not to stream the shutdown checkpoint. It could be useful for initializing hot standby mode in a read replica, for example. Change postmaster to treat background workers that have called MarkPostmasterChildWalSender() as walsenders. That unfortunately requires another small change in postgres core. After doing that, walproposers stay alive longer. However, it also means that the checkpointer will wait for the walproposer to switch to WALSNDSTATE_STOPPING state, when the checkpointer sends the PROCSIG_WALSND_INIT_STOPPING signal. We don't have the machinery in walproposer to receive and handle that signal reliably. Instead, we mark walproposer as being in WALSNDSTATE_STOPPING always. In commit `568f91420a`, I assumed that shutdown will wait for all the remaining WAL to be streamed to safekeepers, but before this commit that was not true, and the test became flaky. This should make it stable again. Some tests wrongly assumed that no WAL could have been written between pg_current_wal_flush_lsn and quick pg stop after it. Fix them by introducing flush_ep_to_pageserver which first stops the endpoint and then waits till all committed WAL reaches the pageserver. In passing extract safekeeper http client to its own module.	2024-03-11 23:29:32 +04:00
Arseny Sher	0cf0731d8b	SIGQUIT instead of SIGKILL prewarmed postgres. To avoid orphaned processes using wiped datadir with confusing logging.	2024-03-11 22:36:52 +04:00
Sasha Krassovsky	98723844ee	Don't return from inside PG_TRY (#7095 ) ## Problem Returning from PG_TRY is a bug, and we currently do that ## Summary of changes Make it break and then return false. This should also help stabilize test_bad_connection.py	2024-03-11 18:36:39 +00:00
Alex Chi Z	73a8c97ac8	fix: warnings when compiling neon extensions (#7053 ) proceeding https://github.com/neondatabase/neon/pull/7010, close https://github.com/neondatabase/neon/issues/6188 ## Summary of changes This pull request (should) fix all warnings except `-Wdeclaration-after-statement` in the neon extension compilation. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-03-11 17:49:58 +00:00
Christian Schwarz	17a3c9036e	follow-up(#7077 ): adjust flaky-test-detection cutoff date for tokio-epoll-uring (#7090 ) Co-authored-by: Alexander Bayandin <alexander@neon.tech>	2024-03-11 16:36:49 +00:00
Joonas Koivunen	8c5b310090	fix: Layer delete on drop and eviction can outlive timeline shutdown (#7082 ) This is a follow-up to #7051 where `LayerInner::drop` and `LayerInner::evict_blocking` were not noticed to require a gate before the file deletion. The lack of entering a gate opens up a similar possibility of deleting a layer file which a newer Timeline instance has already checked out to be resident in a similar case as #7051.	2024-03-11 16:54:06 +01:00
Christian Schwarz	8224580f3e	fix(tenant/timeline metrics): race condition during shutdown + recreation (#7064 ) Tenant::shutdown or Timeline::shutdown completes and becomes externally observable before the corresponding Tenant/Timeline object is dropped. For example, after observing a Tenant::shutdown to complete, we could attach the same tenant_id again. The shut down Tenant object might still be around at the time of the attach. The race is then the following: - old object's metrics are still around - new object uses with_label_values - old object calls remove_label_values The outcome is that the new object will have the metric objects (they're an Arc internall) but the metrics won't be part of the internal registry and hence they'll be missing in `/metrics`. Later, when the new object gets shut down and tries to remove_label_value, it will observe an error because the metric was already removed by the old object. Changes ------- This PR moves metric removal to `shutdown()`. An alternative design would be to multi-version the metrics using a distinguishing label, or, to use a better metrics crate that allows removing metrics from the registry through the locally held metric handle instead of interacting with the (globally shared) registry. refs https://github.com/neondatabase/neon/pull/7051	2024-03-11 15:41:41 +01:00
Christian Schwarz	2b0f3549f7	default to tokio-epoll-uring in CI tests & on Linux (#7077 ) All of production is using it now as of https://github.com/neondatabase/aws/pull/1121 The change in `flaky_tests.py` resets the flakiness detection logic. The alternative would have been to repeat the choice of io engine in each test name, which would junk up the various test reports too much. --------- Co-authored-by: Alexander Bayandin <alexander@neon.tech>	2024-03-11 14:35:59 +00:00
John Spray	b4972d07d4	storage controller: refactor non-mutable members up into Service (#7086 ) result_tx and compute_hook were in ServiceState (i.e. behind a sync mutex), but didn't need to be. Moving them up into Service removes a bunch of boilerplate clones. While we're here, create a helper `Service::maybe_reconcile_shard` which avoids writing out all the `&self.` arguments to `TenantState::maybe_reconcile` everywhere we call it.	2024-03-11 14:29:32 +00:00
Joonas Koivunen	26ae7b0b3e	fix(metrics): reset TENANT_STATE metric on startup (#7084 ) Otherwise, it might happen that we never get to witness the same state on subsequent restarts, thus the time series will show the value from a few restarts ago. The actual case here was that "Activating" was showing `3` while I was doing tenant migration testing on staging. The number 3 was however from a startup that happened some time ago which had been interrupted by another deployment.	2024-03-11 13:25:53 +00:00
John Spray	f8483cc4a3	pageserver: update swagger for HA APIs (#7070 ) - The type of heatmap_period in tenant config was wrrong - Secondary download and heatmap upload endpoints weren't in swagger.	2024-03-11 09:32:17 +00:00
Conrad Ludgate	cc5d6c66b3	proxy: categorise new cplane error message (#7057 ) ## Problem `422 Unprocessable Entity: compute time quota of non-primary branches is exceeded` being marked as a control plane error. ## Summary of changes Add the manual checks to make this a user error that should not be retried.	2024-03-11 09:20:09 +01:00
Roman Zaynetdinov	d894d2b450	Export db size, deadlocks and changed row metrics (#7050 ) ## Problem We want to report metrics for the oldest user database.	2024-03-11 08:10:04 +00:00
Conrad Ludgate	e1b60f3693	Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07 Proxy release 2024-03-07	2024-03-08 08:19:16 +00:00
Anna Khanova	2804f5323b	Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04 Proxy release 2024-03-04	2024-03-04 17:36:11 +04:00
Anna Khanova	676adc6b32	Merge branch 'release-proxy' into rc/proxy/2024-03-04	2024-03-04 16:41:46 +04:00