Add feature flag RemoteExtensionsUseLatest to compute_ctl.

This will allow us to test new versions of extensions, without waiting for main branch commit
2026-03-11 04:10:36 +00:00 · 2024-02-08 01:40:05 +00:00
98 changed files with 1352 additions and 3711 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1329,6 +1329,8 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "diesel",
+ "diesel_migrations",
 "futures",
 "git-version",
 "hex",
@@ -4077,7 +4079,6 @@ dependencies = [
 "clap",
 "consumption_metrics",
 "dashmap",
- "env_logger",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4125,7 +4126,6 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
- "smallvec",
 "smol_str",
 "socket2 0.5.5",
 "sync_wrapper",
@@ -4144,7 +4144,6 @@ dependencies = [
 "tracing-subscriber",
 "tracing-utils",
 "url",
- "urlencoding",
 "utils",
 "uuid",
 "walkdir",
@@ -5740,7 +5739,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6265,7 +6264,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
 "io-uring",
 "libc",
@@ -6832,6 +6831,8 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "diesel",
+ "diesel_derives",
 "either",
 "fail",
 "futures-channel",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -171,7 +171,6 @@ tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
-urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
--- a/5
+++ b/5
@@ -100,11 +100,6 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
       -c "listen_pg_addr='0.0.0.0:6400'" \
       -c "listen_http_addr='0.0.0.0:9898'"

-# When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
-# that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH /usr/local/v16/lib
-
-
 VOLUME ["/data"]
 USER neon
 EXPOSE 6400
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.76.0
+ENV RUSTC_VERSION=1.75.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -639,8 +639,8 @@ FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
-    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
+RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
+    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -809,7 +809,6 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
-COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 Neon
-Copyright 2022 - 2024 Neon Inc.
+Copyright 2022 Neon Inc.

 The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
 See vendor/postgres-vX/COPYRIGHT for details.
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -765,12 +765,7 @@ impl ComputeNode {
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
-        handle_grants(
-            spec,
-            &mut client,
-            connstr.as_str(),
-            self.has_feature(ComputeFeature::AnonExtension),
-        )?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;
@@ -844,12 +839,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(
-                &spec,
-                &mut client,
-                self.connstr.as_str(),
-                self.has_feature(ComputeFeature::AnonExtension),
-            )?;
+            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
            handle_extension_neon(&mut client)?;
            // We can skip handle_migrations here because a new migration can only appear
@@ -1245,10 +1235,19 @@ LIMIT 100",

        info!("Downloading to shared preload libraries: {:?}", &libs_vec);

+        let build_tag_str = if spec
+            .features
+            .contains(&ComputeFeature::RemoteExtensionsUseLatest)
+        {
+            "latest"
+        } else {
+            &self.build_tag
+        };
+
        let mut download_tasks = Vec::new();
        for library in &libs_vec {
            let (ext_name, ext_path) =
-                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
+                remote_extensions.get_ext(library, true, build_tag_str, &self.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -8,6 +8,7 @@ use std::thread;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
+use compute_api::spec::ComputeFeature;

 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
@@ -171,12 +172,16 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    }
                };

-                remote_extensions.get_ext(
-                    &filename,
-                    is_library,
-                    &compute.build_tag,
-                    &compute.pgversion,
-                )
+                let build_tag_str = if spec
+                    .features
+                    .contains(&ComputeFeature::RemoteExtensionsUseLatest)
+                {
+                    "latest"
+                } else {
+                    &compute.build_tag
+                };
+
+                remote_extensions.get_ext(&filename, is_library, build_tag_str, &compute.pgversion)
            };

            match ext {
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -264,10 +264,9 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    // case we miss some events for some reason. Not strictly necessary, but
    // better safe than sorry.
    let (tx, rx) = std::sync::mpsc::channel();
-    let watcher_res = notify::recommended_watcher(move |res| {
+    let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
        let _ = tx.send(res);
-    });
-    let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
+    }) {
        Ok(watcher) => (Box::new(watcher), rx),
        Err(e) => {
            match e.kind {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -581,12 +581,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(
-    spec: &ComputeSpec,
-    client: &mut Client,
-    connstr: &str,
-    enable_anon_extension: bool,
-) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
    info!("modifying database permissions");
    let existing_dbs = get_existing_dbs(client)?;

@@ -683,11 +678,6 @@ pub fn handle_grants(
            inlinify(&grant_query)
        );
        db_client.simple_query(&grant_query)?;
-
-        // it is important to run this after all grants
-        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
-        }
    }

    Ok(())
@@ -776,7 +766,6 @@ BEGIN
    END IF;
 END
 $$;"#,
-        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -820,125 +809,5 @@ $$;"#,
        "Ran {} migrations",
        (migrations.len() - starting_migration_id)
    );
-
-    Ok(())
-}
-
-/// Connect to the database as superuser and pre-create anon extension
-/// if it is present in shared_preload_libraries
-#[instrument(skip_all)]
-pub fn handle_extension_anon(
-    spec: &ComputeSpec,
-    db_owner: &str,
-    db_client: &mut Client,
-    grants_only: bool,
-) -> Result<()> {
-    info!("handle extension anon");
-
-    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-        if libs.contains("anon") {
-            if !grants_only {
-                // check if extension is already initialized using anon.is_initialized()
-                let query = "SELECT anon.is_initialized()";
-                match db_client.query(query, &[]) {
-                    Ok(rows) => {
-                        if !rows.is_empty() {
-                            let is_initialized: bool = rows[0].get(0);
-                            if is_initialized {
-                                info!("anon extension is already initialized");
-                                return Ok(());
-                            }
-                        }
-                    }
-                    Err(e) => {
-                        warn!(
-                            "anon extension is_installed check failed with expected error: {}",
-                            e
-                        );
-                    }
-                };
-
-                // Create anon extension if this compute needs it
-                // Users cannot create it themselves, because superuser is required.
-                let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
-                info!("creating anon extension with query: {}", query);
-                match db_client.query(query, &[]) {
-                    Ok(_) => {}
-                    Err(e) => {
-                        error!("anon extension creation failed with error: {}", e);
-                        return Ok(());
-                    }
-                }
-
-                // check that extension is installed
-                query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-                let rows = db_client.query(query, &[])?;
-                if rows.is_empty() {
-                    error!("anon extension is not installed");
-                    return Ok(());
-                }
-
-                // Initialize anon extension
-                // This also requires superuser privileges, so users cannot do it themselves.
-                query = "SELECT anon.init()";
-                match db_client.query(query, &[]) {
-                    Ok(_) => {}
-                    Err(e) => {
-                        error!("anon.init() failed with error: {}", e);
-                        return Ok(());
-                    }
-                }
-            }
-
-            // check that extension is installed, if not bail early
-            let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-            match db_client.query(query, &[]) {
-                Ok(rows) => {
-                    if rows.is_empty() {
-                        error!("anon extension is not installed");
-                        return Ok(());
-                    }
-                }
-                Err(e) => {
-                    error!("anon extension check failed with error: {}", e);
-                    return Ok(());
-                }
-            };
-
-            let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            // Grant permissions to db_owner to use anon extension functions
-            let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            // This is needed, because some functions are defined as SECURITY DEFINER.
-            // In Postgres SECURITY DEFINER functions are executed with the privileges
-            // of the owner.
-            // In anon extension this it is needed to access some GUCs, which are only accessible to
-            // superuser. But we've patched postgres to allow db_owner to access them as well.
-            // So we need to change owner of these functions to db_owner.
-            let query = format!("
-                SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
-                from pg_proc p
-                join pg_namespace nsp ON p.pronamespace = nsp.oid
-                where nsp.nspname = 'anon';", db_owner);
-
-            info!("change anon extension functions owner to db owner");
-            db_client.simple_query(&query)?;
-
-            //  affects views as well
-            let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-        }
-    }
-
    Ok(())
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -7,7 +7,6 @@ CREATE TABLE tenant_shards (
  generation INTEGER NOT NULL,
  generation_pageserver BIGINT NOT NULL,
  placement_policy VARCHAR NOT NULL,
-  splitting SMALLINT NOT NULL,
  -- config is JSON encoded, opaque to the database.
  config TEXT NOT NULL
 );
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -3,8 +3,7 @@ use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TimelineCreateRequest,
+    TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -42,7 +41,7 @@ pub struct HttpState {

 impl HttpState {
    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
-        let allowlist_routes = ["/status", "/ready", "/metrics"]
+        let allowlist_routes = ["/status"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
@@ -293,19 +292,6 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }

-async fn handle_tenant_shard_split(
-    service: Arc<Service>,
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
-
-    json_response(
-        StatusCode::OK,
-        service.tenant_shard_split(tenant_id, split_req).await?,
-    )
-}
-
 async fn handle_tenant_shard_migrate(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -325,17 +311,6 @@ async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError>
    json_response(StatusCode::OK, ())
 }

-/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
-/// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
-async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-    if state.service.startup_complete.is_ready() {
-        json_response(StatusCode::OK, ())
-    } else {
-        json_response(StatusCode::SERVICE_UNAVAILABLE, ())
-    }
-}
-
 impl From<ReconcileError> for ApiError {
    fn from(value: ReconcileError) -> Self {
        ApiError::Conflict(format!("Reconciliation error: {}", value))
@@ -391,7 +366,6 @@ pub fn make_router(
        .data(Arc::new(HttpState::new(service, auth)))
        // Non-prefixed generic endpoints (status, metrics)
        .get("/status", |r| request_span(r, handle_status))
-        .get("/ready", |r| request_span(r, handle_ready))
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
            request_span(r, handle_re_attach)
@@ -417,9 +391,6 @@ pub fn make_router(
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(r, handle_tenant_shard_migrate)
        })
-        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(r, handle_tenant_shard_split)
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,9 +1,7 @@
-pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::time::Duration;

-use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
@@ -365,107 +363,19 @@ impl Persistence {
        Ok(())
    }

-    // When we start shard splitting, we must durably mark the tenant so that
-    // on restart, we know that we must go through recovery.
-    //
-    // We create the child shards here, so that they will be available for increment_generation calls
-    // if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
+    // TODO: when we start shard splitting, we must durably mark the tenant so that
+    // on restart, we know that we must go through recovery (list shards that exist
+    // and pick up where we left off and/or revert to parent shards).
    #[allow(dead_code)]
-    pub(crate) async fn begin_shard_split(
-        &self,
-        old_shard_count: ShardCount,
-        split_tenant_id: TenantId,
-        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> DatabaseResult<()> {
-                // Mark parent shards as splitting
-
-                let expect_parent_records = std::cmp::max(1, old_shard_count.0);
-
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
-                    .set((splitting.eq(1),))
-                    .execute(conn)?;
-                if u8::try_from(updated)
-                    .map_err(|_| DatabaseError::Logical(
-                        format!("Overflow existing shard count {} while splitting", updated))
-                    )? != expect_parent_records {
-                    // Perhaps a deletion or another split raced with this attempt to split, mutating
-                    // the parent shards that we intend to split. In this case the split request should fail.
-                    return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
-                    ));
-                }
-
-                // FIXME: spurious clone to sidestep closure move rules
-                let parent_to_children = parent_to_children.clone();
-
-                // Insert child shards
-                for (parent_shard_id, children) in parent_to_children {
-                    let mut parent = crate::schema::tenant_shards::table
-                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
-                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
-                        .load::<TenantShardPersistence>(conn)?;
-                    let parent = if parent.len() != 1 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Parent shard {parent_shard_id} not found"
-                        )));
-                    } else {
-                        parent.pop().unwrap()
-                    };
-                    for mut shard in children {
-                        // Carry the parent's generation into the child
-                        shard.generation = parent.generation;
-
-                        debug_assert!(shard.splitting == SplitState::Splitting);
-                        diesel::insert_into(tenant_shards)
-                            .values(shard)
-                            .execute(conn)?;
-                    }
-                }
-
-                Ok(())
-            })?;
-
-            Ok(())
-        })
-        .await
+    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
    }

-    // When we finish shard splitting, we must atomically clean up the old shards
+    // TODO: when we finish shard splitting, we must atomically clean up the old shards
    // and insert the new shards, and clear the splitting marker.
    #[allow(dead_code)]
-    pub(crate) async fn complete_shard_split(
-        &self,
-        split_tenant_id: TenantId,
-        old_shard_count: ShardCount,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                // Drop parent shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
-                    .execute(conn)?;
-
-                // Clear sharding flag
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
-                debug_assert!(updated > 0);
-
-                Ok(())
-            })?;
-
-            Ok(())
-        })
-        .await
+    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
    }
 }

@@ -493,8 +403,6 @@ pub(crate) struct TenantShardPersistence {
    #[serde(default)]
    pub(crate) placement_policy: String,
    #[serde(default)]
-    pub(crate) splitting: SplitState,
-    #[serde(default)]
    pub(crate) config: String,
 }

--- a/control_plane/attachment_service/src/persistence/split_state.rs
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
@@ -1,46 +0,0 @@
-use diesel::pg::{Pg, PgValue};
-use diesel::{
-    deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
-    sql_types::Int2,
-};
-use serde::{Deserialize, Serialize};
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
-#[diesel(sql_type = SplitStateSQLRepr)]
-#[derive(Deserialize, Serialize)]
-pub enum SplitState {
-    Idle = 0,
-    Splitting = 1,
-}
-
-impl Default for SplitState {
-    fn default() -> Self {
-        Self::Idle
-    }
-}
-
-type SplitStateSQLRepr = Int2;
-
-impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
-    fn to_sql<'a>(
-        &'a self,
-        out: &'a mut diesel::serialize::Output<Pg>,
-    ) -> diesel::serialize::Result {
-        let raw_value: i16 = *self as i16;
-        let mut new_out = out.reborrow();
-        ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
-    }
-}
-
-impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
-    fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
-        match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
-            0 => Some(Self::Idle),
-            1 => Some(Self::Splitting),
-            _ => None,
-        })? {
-            Some(v) => Ok(v),
-            None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()),
-        }
-    }
-}
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -20,7 +20,6 @@ diesel::table! {
        generation -> Int4,
        generation_pageserver -> Int8,
        placement_policy -> Varchar,
-        splitting -> Int2,
        config -> Text,
    }
 }
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,6 +1,5 @@
 use std::{
-    cmp::Ordering,
-    collections::{BTreeMap, HashMap, HashSet},
+    collections::{BTreeMap, HashMap},
    str::FromStr,
    sync::Arc,
    time::{Duration, Instant},
@@ -24,14 +23,13 @@ use pageserver_api::{
    models::{
        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
 use utils::{
-    backoff,
    completion::Barrier,
    generation::Generation,
    http::error::ApiError,
@@ -42,11 +40,7 @@ use utils::{
 use crate::{
    compute_hook::{self, ComputeHook},
    node::Node,
-    persistence::{
-        split_state::SplitState, DatabaseError, NodePersistence, Persistence,
-        TenantShardPersistence,
-    },
-    reconciler::attached_location_conf,
+    persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
    scheduler::Scheduler,
    tenant_state::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -151,71 +145,31 @@ impl Service {
        // indeterminate, same as in [`ObservedStateLocation`])
        let mut observed = HashMap::new();

-        let mut nodes_online = HashSet::new();
-
-        // TODO: give Service a cancellation token for clean shutdown
-        let cancel = CancellationToken::new();
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };

        // TODO: issue these requests concurrently
-        {
-            let nodes = {
-                let locked = self.inner.read().unwrap();
-                locked.nodes.clone()
-            };
-            for node in nodes.values() {
-                let http_client = reqwest::ClientBuilder::new()
-                    .timeout(Duration::from_secs(5))
-                    .build()
-                    .expect("Failed to construct HTTP client");
-                let client = mgmt_api::Client::from_client(
-                    http_client,
-                    node.base_url(),
-                    self.config.jwt_token.as_deref(),
-                );
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());

-                fn is_fatal(e: &mgmt_api::Error) -> bool {
-                    use mgmt_api::Error::*;
-                    match e {
-                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
-                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
-                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
-                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
-                        ApiError(_, _) => true,
-                    }
+            tracing::info!("Scanning shards on node {}...", node.id);
+            match client.list_location_config().await {
+                Err(e) => {
+                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                }
+                Ok(listing) => {
+                    tracing::info!(
+                        "Received {} shard statuses from pageserver {}, setting it to Active",
+                        listing.tenant_shards.len(),
+                        node.id
+                    );

-                let list_response = backoff::retry(
-                    || client.list_location_config(),
-                    is_fatal,
-                    1,
-                    5,
-                    "Location config listing",
-                    &cancel,
-                )
-                .await;
-                let Some(list_response) = list_response else {
-                    tracing::info!("Shutdown during startup_reconcile");
-                    return;
-                };
-
-                tracing::info!("Scanning shards on node {}...", node.id);
-                match list_response {
-                    Err(e) => {
-                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                        // TODO: be more tolerant, do some retries, in case
-                        // pageserver is being restarted at the same time as we are
-                    }
-                    Ok(listing) => {
-                        tracing::info!(
-                            "Received {} shard statuses from pageserver {}, setting it to Active",
-                            listing.tenant_shards.len(),
-                            node.id
-                        );
-                        nodes_online.insert(node.id);
-
-                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                            observed.insert(tenant_shard_id, (node.id, conf_opt));
-                        }
+                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
+                        observed.insert(tenant_shard_id, (node.id, conf_opt));
                    }
                }
            }
@@ -226,19 +180,8 @@ impl Service {
        let mut compute_notifications = Vec::new();

        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let (shard_count, nodes) = {
+        let shard_count = {
            let mut locked = self.inner.write().unwrap();
-
-            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
-            let mut nodes = (*locked.nodes).clone();
-            for (node_id, node) in nodes.iter_mut() {
-                if nodes_online.contains(node_id) {
-                    node.availability = NodeAvailability::Active;
-                }
-            }
-            locked.nodes = Arc::new(nodes);
-            let nodes = locked.nodes.clone();
-
            for (tenant_shard_id, (node_id, observed_loc)) in observed {
                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
                    cleanup.push((tenant_shard_id, node_id));
@@ -270,7 +213,7 @@ impl Service {
                }
            }

-            (locked.tenants.len(), nodes)
+            locked.tenants.len()
        };

        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
@@ -331,8 +274,9 @@ impl Service {
        let stream = futures::stream::iter(compute_notifications.into_iter())
            .map(|(tenant_shard_id, node_id)| {
                let compute_hook = compute_hook.clone();
-                let cancel = cancel.clone();
                async move {
+                    // TODO: give Service a cancellation token for clean shutdown
+                    let cancel = CancellationToken::new();
                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                        tracing::error!(
                            tenant_shard_id=%tenant_shard_id,
@@ -438,7 +382,7 @@ impl Service {
            ))),
            config,
            persistence,
-            startup_complete: startup_complete.clone(),
+            startup_complete,
        });

        let result_task_this = this.clone();
@@ -532,7 +476,6 @@ impl Service {
                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
-                splitting: SplitState::default(),
            };

            match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -775,7 +718,6 @@ impl Service {
                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
-                splitting: SplitState::default(),
            })
            .collect();
        self.persistence
@@ -1035,10 +977,6 @@ impl Service {
            }
        };

-        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
-        // because otherwise a broken compute hook causes a feedback loop where
-        // location_config returns 500 and gets retried forever.
-
        if let Some(create_req) = maybe_create {
            let create_resp = self.tenant_create(create_req).await?;
            result.shards = create_resp
@@ -1162,7 +1100,6 @@ impl Service {
        self.ensure_attached_wait(tenant_id).await?;

        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -1243,7 +1180,6 @@ impl Service {
        self.ensure_attached_wait(tenant_id).await?;

        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -1416,326 +1352,6 @@ impl Service {
        })
    }

-    pub(crate) async fn tenant_shard_split(
-        &self,
-        tenant_id: TenantId,
-        split_req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse, ApiError> {
-        let mut policy = None;
-        let mut shard_ident = None;
-
-        // TODO: put a cancellation token on Service for clean shutdown
-        let cancel = CancellationToken::new();
-
-        // A parent shard which will be split
-        struct SplitTarget {
-            parent_id: TenantShardId,
-            node: Node,
-            child_ids: Vec<TenantShardId>,
-        }
-
-        // Validate input, and calculate which shards we will create
-        let (old_shard_count, targets, compute_hook) = {
-            let locked = self.inner.read().unwrap();
-
-            let pageservers = locked.nodes.clone();
-
-            let mut targets = Vec::new();
-
-            // In case this is a retry, count how many already-split shards we found
-            let mut children_found = Vec::new();
-            let mut old_shard_count = None;
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                match shard.shard.count.0.cmp(&split_req.new_shard_count) {
-                    Ordering::Equal => {
-                        //  Already split this
-                        children_found.push(*tenant_shard_id);
-                        continue;
-                    }
-                    Ordering::Greater => {
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Requested count {} but already have shards at count {}",
-                            split_req.new_shard_count,
-                            shard.shard.count.0
-                        )));
-                    }
-                    Ordering::Less => {
-                        // Fall through: this shard has lower count than requested,
-                        // is a candidate for splitting.
-                    }
-                }
-
-                match old_shard_count {
-                    None => old_shard_count = Some(shard.shard.count),
-                    Some(old_shard_count) => {
-                        if old_shard_count != shard.shard.count {
-                            // We may hit this case if a caller asked for two splits to
-                            // different sizes, before the first one is complete.
-                            // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
-                            // of shard_count=1 and shard_count=2 shards in the map.
-                            return Err(ApiError::Conflict(
-                                "Cannot split, currently mid-split".to_string(),
-                            ));
-                        }
-                    }
-                }
-                if policy.is_none() {
-                    policy = Some(shard.policy.clone());
-                }
-                if shard_ident.is_none() {
-                    shard_ident = Some(shard.shard);
-                }
-
-                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
-                    tracing::info!(
-                        "Tenant shard {} already has shard count {}",
-                        tenant_shard_id,
-                        split_req.new_shard_count
-                    );
-                    continue;
-                }
-
-                let node_id =
-                    shard
-                        .intent
-                        .attached
-                        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
-                            "Cannot split a tenant that is not attached"
-                        )))?;
-
-                let node = pageservers
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                // TODO: if any reconciliation is currently in progress for this shard, wait for it.
-
-                targets.push(SplitTarget {
-                    parent_id: *tenant_shard_id,
-                    node: node.clone(),
-                    child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
-                });
-            }
-
-            if targets.is_empty() {
-                if children_found.len() == split_req.new_shard_count as usize {
-                    return Ok(TenantShardSplitResponse {
-                        new_shards: children_found,
-                    });
-                } else {
-                    // No shards found to split, and no existing children found: the
-                    // tenant doesn't exist at all.
-                    return Err(ApiError::NotFound(
-                        anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
-                    ));
-                }
-            }
-
-            (old_shard_count, targets, locked.compute_hook.clone())
-        };
-
-        // unwrap safety: we would have returned above if we didn't find at least one shard to split
-        let old_shard_count = old_shard_count.unwrap();
-        let shard_ident = shard_ident.unwrap();
-        let policy = policy.unwrap();
-
-        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
-        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
-        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
-        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
-        // (https://github.com/neondatabase/neon/issues/6676)
-
-        // Before creating any new child shards in memory or on the pageservers, persist them: this
-        // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
-        // acts as the protection against two concurrent attempts to split: one of them will get a database
-        // error trying to insert the child shards.
-        let mut child_tsps = Vec::new();
-        for target in &targets {
-            let mut this_child_tsps = Vec::new();
-            for child in &target.child_ids {
-                let mut child_shard = shard_ident;
-                child_shard.number = child.shard_number;
-                child_shard.count = child.shard_count;
-
-                this_child_tsps.push(TenantShardPersistence {
-                    tenant_id: child.tenant_id.to_string(),
-                    shard_number: child.shard_number.0 as i32,
-                    shard_count: child.shard_count.0 as i32,
-                    shard_stripe_size: shard_ident.stripe_size.0 as i32,
-                    // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
-                    // populate the correct generation as part of its transaction, to protect us
-                    // against racing with changes in the state of the parent.
-                    generation: 0,
-                    generation_pageserver: target.node.id.0 as i64,
-                    placement_policy: serde_json::to_string(&policy).unwrap(),
-                    // TODO: get the config out of the map
-                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
-                    splitting: SplitState::Splitting,
-                });
-            }
-
-            child_tsps.push((target.parent_id, this_child_tsps));
-        }
-
-        if let Err(e) = self
-            .persistence
-            .begin_shard_split(old_shard_count, tenant_id, child_tsps)
-            .await
-        {
-            match e {
-                DatabaseError::Query(diesel::result::Error::DatabaseError(
-                    DatabaseErrorKind::UniqueViolation,
-                    _,
-                )) => {
-                    // Inserting a child shard violated a unique constraint: we raced with another call to
-                    // this function
-                    tracing::warn!("Conflicting attempt to split {tenant_id}: {e}");
-                    return Err(ApiError::Conflict("Tenant is already splitting".into()));
-                }
-                _ => return Err(ApiError::InternalServerError(e.into())),
-            }
-        }
-
-        // FIXME: we have now committed the shard split state to the database, so any subsequent
-        // failure needs to roll it back.  We will later wrap this function in logic to roll back
-        // the split if it fails.
-        // (https://github.com/neondatabase/neon/issues/6676)
-
-        // TODO: issue split calls concurrently (this only matters once we're splitting
-        // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
-
-        for target in &targets {
-            let SplitTarget {
-                parent_id,
-                node,
-                child_ids,
-            } = target;
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
-            let response = client
-                .tenant_shard_split(
-                    *parent_id,
-                    TenantShardSplitRequest {
-                        new_shard_count: split_req.new_shard_count,
-                    },
-                )
-                .await
-                .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
-
-            tracing::info!(
-                "Split {} into {}",
-                parent_id,
-                response
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-
-            if &response.new_shards != child_ids {
-                // This should never happen: the pageserver should agree with us on how shard splits work.
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})",
-                    parent_id,
-                    response.new_shards,
-                    child_ids
-                )));
-            }
-        }
-
-        // TODO: if the pageserver restarted concurrently with our split API call,
-        // the actual generation of the child shard might differ from the generation
-        // we expect it to have.  In order for our in-database generation to end up
-        // correct, we should carry the child generation back in the response and apply it here
-        // in complete_shard_split (and apply the correct generation in memory)
-        // (or, we can carry generation in the request and reject the request if
-        //  it doesn't match, but that requires more retry logic on this side)
-
-        self.persistence
-            .complete_shard_split(tenant_id, old_shard_count)
-            .await?;
-
-        // Replace all the shards we just split with their children
-        let mut response = TenantShardSplitResponse {
-            new_shards: Vec::new(),
-        };
-        let mut child_locations = Vec::new();
-        {
-            let mut locked = self.inner.write().unwrap();
-            for target in targets {
-                let SplitTarget {
-                    parent_id,
-                    node: _node,
-                    child_ids,
-                } = target;
-                let (pageserver, generation, config) = {
-                    let old_state = locked
-                        .tenants
-                        .remove(&parent_id)
-                        .expect("It was present, we just split it");
-                    (
-                        old_state.intent.attached.unwrap(),
-                        old_state.generation,
-                        old_state.config.clone(),
-                    )
-                };
-
-                locked.tenants.remove(&parent_id);
-
-                for child in child_ids {
-                    let mut child_shard = shard_ident;
-                    child_shard.number = child.shard_number;
-                    child_shard.count = child.shard_count;
-
-                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
-                    child_observed.insert(
-                        pageserver,
-                        ObservedStateLocation {
-                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
-                        },
-                    );
-
-                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
-                    child_state.intent = IntentState::single(Some(pageserver));
-                    child_state.observed = ObservedState {
-                        locations: child_observed,
-                    };
-                    child_state.generation = generation;
-                    child_state.config = config.clone();
-
-                    child_locations.push((child, pageserver));
-
-                    locked.tenants.insert(child, child_state);
-                    response.new_shards.push(child);
-                }
-            }
-        }
-
-        // Send compute notifications for all the new shards
-        let mut failed_notifications = Vec::new();
-        for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
-                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
-                        child_id, child_ps);
-                failed_notifications.push(child_id);
-            }
-        }
-
-        // If we failed any compute notifications, make a note to retry later.
-        if !failed_notifications.is_empty() {
-            let mut locked = self.inner.write().unwrap();
-            for failed in failed_notifications {
-                if let Some(shard) = locked.tenants.get_mut(&failed) {
-                    shard.pending_compute_notification = true;
-                }
-            }
-        }
-
-        Ok(response)
-    }
-
    pub(crate) async fn tenant_shard_migrate(
        &self,
        tenant_shard_id: TenantShardId,
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -193,13 +193,6 @@ impl IntentState {
        result
    }

-    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
-        Self {
-            attached: node_id,
-            secondary: vec![],
-        }
-    }
-
    /// When a node goes offline, we update intents to avoid using it
    /// as their attached pageserver.
    ///
@@ -293,9 +286,6 @@ impl TenantState {
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.

-        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
-        // change their attach location.
-
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut used_pageservers = self.intent.all_pageservers();
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,17 +1,20 @@
 use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
-    models::{
-        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
-    },
+    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::str::FromStr;
+use std::{env, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -267,6 +270,37 @@ impl AttachmentService {
        .expect("non-Unicode path")
    }

+    /// In order to access database migrations, we need to find the Neon source tree
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either prd or our binary is in the source tree. The former is usually
+        // true for automated test runners, the latter is usually true for developer workstations. Often
+        // both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // For each candidate start point, search through ancestors looking for a neon.git source tree root
+        for start_point in &candidate_start_points {
+            // Start from the build dir: assumes we are running out of a built neon source tree
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // Fall-through
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
    ///
    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
@@ -306,32 +340,69 @@ impl AttachmentService {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "attachment_service";
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }

-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let createdb_path = pg_bin_dir.join("createdb");
-        let output = Command::new(&createdb_path)
-            .args([
-                "-h",
-                "localhost",
-                "-p",
-                &format!("{}", self.postgres_port),
-                &DB_NAME,
-            ])
-            .output()
-            .await
-            .expect("Failed to spawn createdb");
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }

-        if !output.status.success() {
-            let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
-            if stderr.contains("already exists") {
-                tracing::info!("Database {DB_NAME} already exists");
-            } else {
-                anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
            }
        }

+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
        Ok(database_url)
    }

@@ -577,7 +648,7 @@ impl AttachmentService {
    ) -> anyhow::Result<TenantShardMigrateResponse> {
        self.dispatch(
            Method::PUT,
-            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
+            format!("tenant/{tenant_shard_id}/migrate"),
            Some(TenantShardMigrateRequest {
                tenant_shard_id,
                node_id,
@@ -586,20 +657,6 @@ impl AttachmentService {
        .await
    }

-    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
-    pub async fn tenant_split(
-        &self,
-        tenant_id: TenantId,
-        new_shard_count: u8,
-    ) -> anyhow::Result<TenantShardSplitResponse> {
-        self.dispatch(
-            Method::PUT,
-            format!("control/v1/tenant/{tenant_id}/shard_split"),
-            Some(TenantShardSplitRequest { new_shard_count }),
-        )
-        .await
-    }
-
    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -72,6 +72,7 @@ where
    let log_path = datadir.join(format!("{process_name}.log"));
    let process_log_file = fs::OpenOptions::new()
        .create(true)
+        .write(true)
        .append(true)
        .open(&log_path)
        .with_context(|| {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -575,26 +575,6 @@ async fn handle_tenant(
            println!("{tenant_table}");
            println!("{shard_table}");
        }
-        Some(("shard-split", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
-
-            let attachment_service = AttachmentService::from_env(env);
-            let result = attachment_service
-                .tenant_split(tenant_id, shard_count)
-                .await?;
-            println!(
-                "Split tenant {} into shards {}",
-                tenant_id,
-                result
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
-
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -1014,13 +994,12 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
            let destroy = sub_args.get_flag("destroy");
-            let mode = sub_args.get_one::<String>("mode").expect("has a default");

            let endpoint = cplane
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(mode, destroy)?;
+            endpoint.stop(destroy)?;
        }

        _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
@@ -1304,7 +1283,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
+                if let Err(e) = node.stop(false) {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
@@ -1545,11 +1524,6 @@ fn cli() -> Command {
            .subcommand(Command::new("status")
                .about("Human readable summary of the tenant's shards and attachment locations")
                .arg(tenant_id_arg.clone()))
-            .subcommand(Command::new("shard-split")
-                .about("Increase the number of shards in the tenant")
-                .arg(tenant_id_arg.clone())
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1653,16 +1627,7 @@ fn cli() -> Command {
                            .long("destroy")
                            .action(ArgAction::SetTrue)
                            .required(false)
-                    )
-                    .arg(
-                        Arg::new("mode")
-                            .help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
-                            .long("mode")
-                            .action(ArgAction::Set)
-                            .required(false)
-                            .value_parser(["smart", "fast", "immediate"])
-                            .default_value("fast")
-                    )
+                        )
                )

        )
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -761,8 +761,22 @@ impl Endpoint {
        }
    }

-    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
-        self.pg_ctl(&["-m", mode, "stop"], &None)?;
+    pub fn stop(&self, destroy: bool) -> Result<()> {
+        // If we are going to destroy data directory,
+        // use immediate shutdown mode, otherwise,
+        // shutdown gracefully to leave the data directory sane.
+        //
+        // Postgres is always started from scratch, so stop
+        // without destroy only used for testing and debugging.
+        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;

        // Also wait for the compute_ctl process to die. It might have some
        // cleanup work to do after postgres stops, like syncing safekeepers,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,8 +90,10 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

-    /// Pre-install and initialize anon extension for every database in the cluster
-    AnonExtension,
+    // Use latest version of remote extensions
+    // This is needed to allow us to test new versions of extensions before
+    // they are merged into the main branch.
+    RemoteExtensionsUseLatest,

    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
@@ -155,8 +157,12 @@ impl RemoteExtSpec {
                //
                // Keep it in sync with path generation in
                // https://github.com/neondatabase/build-custom-extensions/tree/main
+                //
+                // if ComputeFeature::RemoteExtensionsUseLatest is enabled
+                // use "latest" as the build_tag
                let archive_path_str =
                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+
                Ok((
                    real_ext_name.to_string(),
                    RemotePath::from_string(&archive_path_str)?,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -192,16 +192,6 @@ pub struct TimelineCreateRequest {
    pub pg_version: Option<u32>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantShardSplitRequest {
-    pub new_shard_count: u8,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantShardSplitResponse {
-    pub new_shards: Vec<TenantShardId>,
-}
-
 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -88,36 +88,12 @@ impl TenantShardId {
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
            shard_count: self.shard_count,
        }
    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
 }

 /// Formatting helper
@@ -817,108 +793,4 @@ mod tests {
        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
        assert_eq!(shard, ShardNumber(8));
    }
-
-    #[test]
-    fn shard_id_split() {
-        let tenant_id = TenantId::generate();
-        let parent = TenantShardId::unsharded(tenant_id);
-
-        // Unsharded into 2
-        assert_eq!(
-            parent.split(ShardCount(2)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(1)
-                }
-            ]
-        );
-
-        // Unsharded into 4
-        assert_eq!(
-            parent.split(ShardCount(4)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(1)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(2)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(3)
-                }
-            ]
-        );
-
-        // count=1 into 2 (check this works the same as unsharded.)
-        let parent = TenantShardId {
-            tenant_id,
-            shard_count: ShardCount(1),
-            shard_number: ShardNumber(0),
-        };
-        assert_eq!(
-            parent.split(ShardCount(2)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(1)
-                }
-            ]
-        );
-
-        // count=2 into count=8
-        let parent = TenantShardId {
-            tenant_id,
-            shard_count: ShardCount(2),
-            shard_number: ShardNumber(1),
-        };
-        assert_eq!(
-            parent.split(ShardCount(8)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(1)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(3)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(5)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(7)
-                },
-            ]
-        );
-    }
 }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -191,7 +191,6 @@ impl RemoteStorage for AzureBlobStorage {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> anyhow::Result<Listing, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -224,8 +223,6 @@ impl RemoteStorage for AzureBlobStorage {

        let mut response = builder.into_stream();
        let mut res = Listing::default();
-        // NonZeroU32 doesn't support subtraction apparently
-        let mut max_keys = max_keys.map(|mk| mk.get());
        while let Some(l) = response.next().await {
            let entry = l.map_err(to_download_error)?;
            let prefix_iter = entry
@@ -238,18 +235,7 @@ impl RemoteStorage for AzureBlobStorage {
                .blobs
                .blobs()
                .map(|k| self.name_to_relative_path(&k.name));
-
-            for key in blob_iter {
-                res.keys.push(key);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(res); // limit reached
-                    }
-                    max_keys = Some(mk);
-                }
-            }
+            res.keys.extend(blob_iter);
        }
        Ok(res)
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -13,15 +13,9 @@ mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
-mod support;

 use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-    time::SystemTime,
+    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
 };

 use anyhow::{bail, Context};
@@ -160,7 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None)
+            .list(prefix, ListingMode::WithDelimiter)
            .await?
            .prefixes;
        Ok(result)
@@ -176,17 +170,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys)
-            .await?
-            .keys;
+    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
        Ok(result)
    }

@@ -194,8 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        &self,
        prefix: Option<&RemotePath>,
        _mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Listing, DownloadError>;
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -285,19 +269,6 @@ impl std::fmt::Display for DownloadError {

 impl std::error::Error for DownloadError {}

-impl DownloadError {
-    /// Returns true if the error should not be retried with backoff
-    pub fn is_permanent(&self) -> bool {
-        use DownloadError::*;
-        match self {
-            BadInput(_) => true,
-            NotFound => true,
-            Cancelled => true,
-            Other(_) => false,
-        }
-    }
-}
-
 #[derive(Debug)]
 pub enum TimeTravelError {
    /// Validation or other error happened due to user input.
@@ -353,31 +324,24 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> anyhow::Result<Listing, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
-            Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
-            Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
-            Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
+            Self::LocalFs(s) => s.list(prefix, mode).await,
+            Self::AwsS3(s) => s.list(prefix, mode).await,
+            Self::AzureBlob(s) => s.list(prefix, mode).await,
+            Self::Unreliable(s) => s.list(prefix, mode).await,
        }
    }

    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys).await,
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::AzureBlob(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,9 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{
-    borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
-};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};

 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -20,7 +18,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
+use crate::{
+    Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
+};

 use super::{RemoteStorage, StorageMetadata};

@@ -164,7 +164,6 @@ impl RemoteStorage for LocalFs {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
        let mut result = Listing::default();

@@ -181,9 +180,6 @@ impl RemoteStorage for LocalFs {
                    !path.is_dir()
                })
                .collect();
-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }

            return Ok(result);
        }
@@ -369,33 +365,27 @@ impl RemoteStorage for LocalFs {
                    format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?;
-
-            let len = source
-                .metadata()
-                .await
-                .context("query file length")
-                .map_err(DownloadError::Other)?
-                .len();
-
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
                .context("Failed to seek to the range start in a local storage file")
                .map_err(DownloadError::Other)?;
-
            let metadata = self
                .read_storage_metadata(&target_path)
                .await
                .map_err(DownloadError::Other)?;

-            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
-            let source = ReaderStream::new(source);
-
+            let download_stream: DownloadStream = match end_exclusive {
+                Some(end_exclusive) => Box::pin(ReaderStream::new(
+                    source.take(end_exclusive - start_inclusive),
+                )),
+                None => Box::pin(ReaderStream::new(source)),
+            };
            Ok(Download {
                metadata,
                last_modified: None,
                etag: None,
-                download_stream: Box::pin(source),
+                download_stream,
            })
        } else {
            Err(DownloadError::NotFound)
@@ -524,8 +514,10 @@ mod fs_tests {
    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

-    async fn read_and_check_metadata(
+    async fn read_and_assert_remote_file_contents(
        storage: &LocalFs,
+        #[allow(clippy::ptr_arg)]
+        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
@@ -604,7 +596,7 @@ mod fs_tests {
        let upload_name = "upload_1";
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

-        let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
+        let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
            contents,
@@ -626,7 +618,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        let full_range_download_contents =
-            read_and_check_metadata(&storage, &upload_target, None).await?;
+            read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
            full_range_download_contents,
@@ -668,22 +660,6 @@ mod fs_tests {
            "Second part bytes should be returned when requested"
        );

-        let suffix_bytes = storage
-            .download_byte_range(&upload_target, 13, None)
-            .await?
-            .download_stream;
-        let suffix_bytes = aggregate(suffix_bytes).await?;
-        let suffix = std::str::from_utf8(&suffix_bytes)?;
-        assert_eq!(upload_name, suffix);
-
-        let all_bytes = storage
-            .download_byte_range(&upload_target, 0, None)
-            .await?
-            .download_stream;
-        let all_bytes = aggregate(all_bytes).await?;
-        let all_bytes = std::str::from_utf8(&all_bytes)?;
-        assert_eq!(dummy_contents("upload_1"), all_bytes);
-
        Ok(())
    }

@@ -760,7 +736,7 @@ mod fs_tests {
            upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;

        let full_range_download_contents =
-            read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
+            read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
        assert_eq!(
            dummy_contents(upload_name),
            full_range_download_contents,
@@ -796,12 +772,12 @@ mod fs_tests {
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;

-        let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
        assert!(listing.prefixes.is_empty());
        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

        // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;

        assert_eq!(
            listing.prefixes,
@@ -814,7 +790,6 @@ mod fs_tests {
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                ListingMode::WithDelimiter,
-                None,
            )
            .await?;
        assert_eq!(
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -7,7 +7,6 @@
 use std::{
    borrow::Cow,
    collections::HashMap,
-    num::NonZeroU32,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
@@ -46,9 +45,8 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
-    support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
-    RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -65,6 +63,7 @@ pub struct S3Bucket {
    concurrency_limiter: ConcurrencyLimiter,
 }

+#[derive(Default)]
 struct GetObjectRequest {
    bucket: String,
    key: String,
@@ -233,8 +232,24 @@ impl S3Bucket {

        let started_at = ScopeGuard::into_inner(started_at);

-        let object_output = match get_object {
-            Ok(object_output) => object_output,
+        match get_object {
+            Ok(object_output) => {
+                let metadata = object_output.metadata().cloned().map(StorageMetadata);
+                let etag = object_output.e_tag.clone();
+                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
+                Ok(Download {
+                    metadata,
+                    etag,
+                    last_modified,
+                    download_stream: Box::pin(body),
+                })
+            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
@@ -244,7 +259,7 @@ impl S3Bucket {
                    AttemptOutcome::Ok,
                    started_at,
                );
-                return Err(DownloadError::NotFound);
+                Err(DownloadError::NotFound)
            }
            Err(e) => {
                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
@@ -253,27 +268,11 @@ impl S3Bucket {
                    started_at,
                );

-                return Err(DownloadError::Other(
+                Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
-                ));
+                ))
            }
-        };
-
-        let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output.e_tag;
-        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
-
-        let body = object_output.body;
-        let body = ByteStreamAsStream::from(body);
-        let body = PermitCarrying::new(permit, body);
-        let body = TimedDownload::new(started_at, body);
-
-        Ok(Download {
-            metadata,
-            etag,
-            last_modified,
-            download_stream: Box::pin(body),
-        })
+        }
    }

    async fn delete_oids(
@@ -355,6 +354,33 @@ impl Stream for ByteStreamAsStream {
    // sense and Stream::size_hint does not really
 }

+pin_project_lite::pin_project! {
+    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
+    struct PermitCarrying<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S> PermitCarrying<S> {
+    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        Self { permit, inner }
+    }
+}
+
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
 pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    struct TimedDownload<S> {
@@ -409,11 +435,8 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
-        // s3 sdk wants i32
-        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
@@ -437,20 +460,13 @@ impl RemoteStorage for S3Bucket {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            // min of two Options, returning Some if one is value and another is
-            // None (None is smaller than anything, so plain min doesn't work).
-            let request_max_keys = self
-                .max_keys_per_list_response
-                .into_iter()
-                .chain(max_keys.into_iter())
-                .min();
            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .set_max_keys(request_max_keys);
+                .set_max_keys(self.max_keys_per_list_response);

            if let ListingMode::WithDelimiter = mode {
                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
@@ -480,14 +496,6 @@ impl RemoteStorage for S3Bucket {
                let object_path = object.key().expect("response does not contain a key");
                let remote_path = self.s3_object_to_relative_path(object_path);
                result.keys.push(remote_path);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(result); // limit reached
-                    }
-                    max_keys = Some(mk);
-                }
            }

            result.prefixes.extend(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -4,7 +4,6 @@
 use bytes::Bytes;
 use futures::stream::Stream;
 use std::collections::HashMap;
-use std::num::NonZeroU32;
 use std::sync::Mutex;
 use std::time::SystemTime;
 use std::{collections::hash_map::Entry, sync::Arc};
@@ -61,7 +60,7 @@ impl UnreliableWrapper {
    /// On the first attempts of this operation, return an error. After 'attempts_to_fail'
    /// attempts, let the operation go ahead, and clear the counter.
    ///
-    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
+    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
        let mut attempts = self.attempts.lock().unwrap();

        match attempts.entry(op) {
@@ -79,13 +78,13 @@ impl UnreliableWrapper {
                } else {
                    let error =
                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
-                    Err(error)
+                    Err(DownloadError::Other(error))
                }
            }
            Entry::Vacant(e) => {
                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                e.insert(1);
-                Err(error)
+                Err(DownloadError::Other(error))
            }
        }
    }
@@ -106,30 +105,22 @@ impl RemoteStorage for UnreliableWrapper {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
        self.inner.list_prefixes(prefix).await
    }

-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys).await
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
+        self.inner.list_files(folder).await
    }

    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list(prefix, mode, max_keys).await
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list(prefix, mode).await
    }

    async fn upload(
@@ -146,8 +137,7 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        self.attempt(RemoteOp::Download(from.clone()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::Download(from.clone()))?;
        self.inner.download(from).await
    }

@@ -160,8 +150,7 @@ impl RemoteStorage for UnreliableWrapper {
        // Note: We treat any download_byte_range as an "attempt" of the same
        // operation. We don't pay attention to the ranges. That's good enough
        // for now.
-        self.attempt(RemoteOp::Download(from.clone()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::Download(from.clone()))?;
        self.inner
            .download_byte_range(from, start_inclusive, end_exclusive)
            .await
@@ -204,7 +193,7 @@ impl RemoteStorage for UnreliableWrapper {
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
-            .map_err(TimeTravelError::Other)?;
+            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
        self.inner
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -1,33 +0,0 @@
-use std::{
-    pin::Pin,
-    task::{Context, Poll},
-};
-
-use futures_util::Stream;
-
-pin_project_lite::pin_project! {
-    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    pub(crate) struct PermitCarrying<S> {
-        permit: tokio::sync::OwnedSemaphorePermit,
-        #[pin]
-        inner: S,
-    }
-}
-
-impl<S> PermitCarrying<S> {
-    pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        Self { permit, inner }
-    }
-}
-
-impl<S: Stream> Stream for PermitCarrying<S> {
-    type Item = <S as Stream>::Item;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        self.project().inner.poll_next(cx)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
-    }
-}
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,8 +1,8 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use remote_storage::RemotePath;
+use std::collections::HashSet;
 use std::sync::Arc;
-use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
 use tracing::debug;

@@ -103,7 +103,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None)
+        .list_files(None)
        .await
        .context("client list root files failure")?
        .into_iter()
@@ -113,17 +113,8 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
        ctx.remote_blobs.clone(),
        "remote storage list_files on root mismatches with the uploads."
    );
-
-    // Test that max_keys limit works. In total there are about 21 files (see
-    // upload_simple_remote_data call in test_real_s3.rs).
-    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()))
-        .await
-        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
-
    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None)
+        .list_files(Some(&base_prefix))
        .await
        .context("client list nested files failure")?
        .into_iter()
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -70,7 +70,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    }

    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None))
+        Ok(retry(|| client.list_files(None))
            .await
            .context("list root files failure")?
            .into_iter()
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -27,11 +27,6 @@ impl Barrier {
            b.wait().await
        }
    }
-
-    /// Return true if a call to wait() would complete immediately
-    pub fn is_ready(&self) -> bool {
-        futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
-    }
 }

 impl PartialEq for Barrier {
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,6 +1,6 @@
 use std::sync::{
    atomic::{AtomicUsize, Ordering},
-    Arc, Mutex, MutexGuard,
+    Arc,
 };
 use tokio::sync::Semaphore;

@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
 ///
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
-    inner: Mutex<Inner<T>>,
+    inner: tokio::sync::RwLock<Inner<T>>,
    initializers: AtomicUsize,
 }

@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
        let sem = Semaphore::new(1);
        sem.close();
        Self {
-            inner: Mutex::new(Inner {
+            inner: tokio::sync::RwLock::new(Inner {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
@@ -61,18 +61,18 @@ impl<T> OnceCell<T> {
    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
    /// returning the guard.
    ///
-    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
    ///
    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        let sem = {
-            let guard = self.inner.lock().unwrap();
+            let guard = self.inner.write().await;
            if guard.value.is_some() {
-                return Ok(Guard(guard));
+                return Ok(GuardMut(guard));
            }
            guard.init_semaphore.clone()
        };
@@ -88,29 +88,72 @@ impl<T> OnceCell<T> {
                let permit = InitPermit(permit);
                let (value, _permit) = factory(permit).await?;

-                let guard = self.inner.lock().unwrap();
+                let guard = self.inner.write().await;

                Ok(Self::set0(value, guard))
            }
            Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
+                let guard = self.inner.write().await;
                assert!(
                    guard.value.is_some(),
                    "semaphore got closed, must be initialized"
                );
-                return Ok(Guard(guard));
+                return Ok(GuardMut(guard));
            }
        }
    }

-    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
+    where
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+    {
+        let sem = {
+            let guard = self.inner.read().await;
+            if guard.value.is_some() {
+                return Ok(GuardRef(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };
+
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.write().await;
+
+                Ok(Self::set0(value, guard).downgrade())
+            }
+            Err(_closed) => {
+                let guard = self.inner.read().await;
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(GuardRef(guard));
+            }
+        }
+    }
+
+    /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized.
-    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
-        let guard = self.inner.lock().unwrap();
+    pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
+        let guard = self.inner.write().await;

        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
@@ -122,21 +165,31 @@ impl<T> OnceCell<T> {
        Self::set0(value, guard)
    }

-    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+    fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
-        Guard(guard)
+        GuardMut(guard)
    }

    /// Returns a guard to an existing initialized value, if any.
-    pub fn get(&self) -> Option<Guard<'_, T>> {
-        let guard = self.inner.lock().unwrap();
+    pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
+        let guard = self.inner.write().await;
        if guard.value.is_some() {
-            Some(Guard(guard))
+            Some(GuardMut(guard))
+        } else {
+            None
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub async fn get(&self) -> Option<GuardRef<'_, T>> {
+        let guard = self.inner.read().await;
+        if guard.value.is_some() {
+            Some(GuardRef(guard))
        } else {
            None
        }
@@ -168,9 +221,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
 /// initialized value.
 #[derive(Debug)]
-pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);

-impl<T> std::ops::Deref for Guard<'_, T> {
+impl<T> std::ops::Deref for GuardMut<'_, T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
@@ -181,7 +234,7 @@ impl<T> std::ops::Deref for Guard<'_, T> {
    }
 }

-impl<T> std::ops::DerefMut for Guard<'_, T> {
+impl<T> std::ops::DerefMut for GuardMut<'_, T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.0
            .value
@@ -190,7 +243,7 @@ impl<T> std::ops::DerefMut for Guard<'_, T> {
    }
 }

-impl<'a, T> Guard<'a, T> {
+impl<'a, T> GuardMut<'a, T> {
    /// Take the current value, and a new permit for it's deinitialization.
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
@@ -208,6 +261,24 @@ impl<'a, T> Guard<'a, T> {
            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
+
+    pub fn downgrade(self) -> GuardRef<'a, T> {
+        GuardRef(self.0.downgrade())
+    }
+}
+
+#[derive(Debug)]
+pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for GuardRef<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
 }

 /// Type held by OnceCell (de)initializing task.
@@ -248,7 +319,7 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_or_init(|permit| {
+                            .get_mut_or_init(|permit| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
@@ -295,7 +366,11 @@ mod tests {
            let cell = cell.clone();
            let deinitialization_started = deinitialization_started.clone();
            async move {
-                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                let (answer, _permit) = cell
+                    .get_mut()
+                    .await
+                    .expect("initialized to value")
+                    .take_and_deinit();
                assert_eq!(answer, initial);

                deinitialization_started.wait().await;
@@ -306,7 +381,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
            .await
            .unwrap();

@@ -318,21 +393,21 @@ mod tests {

        jh.await.unwrap();

-        assert_eq!(*cell.get().unwrap(), reinit);
+        assert_eq!(*cell.get_mut().await.unwrap(), reinit);
    }

-    #[test]
-    fn reinit_with_deinit_permit() {
+    #[tokio::test]
+    async fn reinit_with_deinit_permit() {
        let cell = Arc::new(OnceCell::new(42));

-        let (mol, permit) = cell.get().unwrap().take_and_deinit();
-        cell.set(5, permit);
-        assert_eq!(*cell.get().unwrap(), 5);
+        let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
+        cell.set(5, permit).await;
+        assert_eq!(*cell.get_mut().await.unwrap(), 5);

-        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
        assert_eq!(5, five);
-        cell.set(mol, permit);
-        assert_eq!(*cell.get().unwrap(), 42);
+        cell.set(mol, permit).await;
+        assert_eq!(*cell.get_mut().await.unwrap(), 42);
    }

    #[tokio::test]
@@ -340,13 +415,13 @@ mod tests {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_or_init(|_permit| async { Err("whatever error") })
+            cell.get_mut_or_init(|_permit| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -358,7 +433,7 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_or_init(|permit| async {
+        let initializer = cell.get_mut_or_init(|permit| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

@@ -372,10 +447,10 @@ mod tests {

        // now initializer is dropped

-        assert!(cell.get().is_none());
+        assert!(cell.get_mut().await.is_none());

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -56,18 +56,10 @@ pub enum ForceAwaitLogicalSize {

 impl Client {
    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
-        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
-    }
-
-    pub fn from_client(
-        client: reqwest::Client,
-        mgmt_api_endpoint: String,
-        jwt: Option<&str>,
-    ) -> Self {
        Self {
            mgmt_api_endpoint,
            authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
-            client,
+            client: reqwest::Client::new(),
        }
    }

@@ -318,22 +310,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn tenant_shard_split(
-        &self,
-        tenant_shard_id: TenantShardId,
-        req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse> {
-        let uri = format!(
-            "{}/v1/tenant/{}/shard_split",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.request(Method::PUT, &uri, req)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn timeline_list(
        &self,
        tenant_shard_id: &TenantShardId,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -274,10 +274,6 @@ fn start_pageserver(
    set_launch_timestamp_metric(launch_ts);
    #[cfg(target_os = "linux")]
    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    metrics::register_internal(Box::new(
-        pageserver::metrics::tokio_epoll_uring::Collector::new(),
-    ))
-    .unwrap();
    pageserver::preinitialize_metrics();

    // If any failpoints were set from FAILPOINTS environment variable,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -623,7 +623,6 @@ impl std::fmt::Display for EvictionLayer {
    }
 }

-#[derive(Default)]
 pub(crate) struct DiskUsageEvictionInfo {
    /// Timeline's largest layer (remote or resident)
    pub max_layer_size: Option<u64>,
@@ -855,27 +854,19 @@ async fn collect_eviction_candidates(

        let total = tenant_candidates.len();

-        let tenant_candidates =
-            tenant_candidates
-                .into_iter()
-                .enumerate()
-                .map(|(i, mut candidate)| {
-                    // as we iterate this reverse sorted list, the most recently accessed layer will always
-                    // be 1.0; this is for us to evict it last.
-                    candidate.relative_last_activity =
-                        eviction_order.relative_last_activity(total, i);
+        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
+            // as we iterate this reverse sorted list, the most recently accessed layer will always
+            // be 1.0; this is for us to evict it last.
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);

-                    let partition = if cumsum > min_resident_size as i128 {
-                        MinResidentSizePartition::Above
-                    } else {
-                        MinResidentSizePartition::Below
-                    };
-                    cumsum += i128::from(candidate.layer.get_file_size());
-
-                    (partition, candidate)
-                });
-
-        candidates.extend(tenant_candidates);
+            let partition = if cumsum > min_resident_size as i128 {
+                MinResidentSizePartition::Above
+            } else {
+                MinResidentSizePartition::Below
+            };
+            cumsum += i128::from(candidate.layer.get_file_size());
+            candidates.push((partition, candidate));
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -891,41 +882,21 @@ async fn collect_eviction_candidates(
    );

    for secondary_tenant in secondary_tenants {
-        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
-        // to prevent repeated disk usage based evictions from completely draining less often
-        // updating secondaries.
-        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
-
-        debug_assert!(
-            total_layers >= layer_info.resident_layers.len(),
-            "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
-            layer_info.resident_layers.len()
-        );
+        let mut layer_info = secondary_tenant.get_layers_for_eviction();

        layer_info
            .resident_layers
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));

-        let tenant_candidates =
-            layer_info
-                .resident_layers
-                .into_iter()
-                .enumerate()
-                .map(|(i, mut candidate)| {
-                    candidate.relative_last_activity =
-                        eviction_order.relative_last_activity(total_layers, i);
-                    (
-                        // Secondary locations' layers are always considered above the min resident size,
-                        // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                        // the layers have sufficiently old access times.
-                        MinResidentSizePartition::Above,
-                        candidate,
-                    )
-                });
-
-        candidates.extend(tenant_candidates);
-
-        tokio::task::yield_now().await;
+        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
+            (
+                // Secondary locations' layers are always considered above the min resident size,
+                // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                // the layers have sufficiently old access times.
+                MinResidentSizePartition::Above,
+                candidate,
+            )
+        }));
    }

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,14 +19,11 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantShardLocation;
-use pageserver_api::models::TenantShardSplitRequest;
-use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
-use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
@@ -878,7 +875,7 @@ async fn tenant_reset_handler(
    let state = get_state(&request);
    state
        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1107,25 +1104,6 @@ async fn tenant_size_handler(
    )
 }

-async fn tenant_shard_split_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let req: TenantShardSplitRequest = json_request(&mut request).await?;
-
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let state = get_state(&request);
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let new_shards = state
-        .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
-}
-
 async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2085,9 +2063,6 @@ pub fn make_router(
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
-        .put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
-            api_handler(r, tenant_shard_split_handler)
-        })
        .get("/v1/tenant/:tenant_shard_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2400,72 +2400,6 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
    }
 }

-pub mod tokio_epoll_uring {
-    use metrics::UIntGauge;
-
-    pub struct Collector {
-        descs: Vec<metrics::core::Desc>,
-        systems_created: UIntGauge,
-        systems_destroyed: UIntGauge,
-    }
-
-    const NMETRICS: usize = 2;
-
-    impl metrics::core::Collector for Collector {
-        fn desc(&self) -> Vec<&metrics::core::Desc> {
-            self.descs.iter().collect()
-        }
-
-        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
-            let mut mfs = Vec::with_capacity(NMETRICS);
-            let tokio_epoll_uring::metrics::Metrics {
-                systems_created,
-                systems_destroyed,
-            } = tokio_epoll_uring::metrics::global();
-            self.systems_created.set(systems_created);
-            mfs.extend(self.systems_created.collect());
-            self.systems_destroyed.set(systems_destroyed);
-            mfs.extend(self.systems_destroyed.collect());
-            mfs
-        }
-    }
-
-    impl Collector {
-        #[allow(clippy::new_without_default)]
-        pub fn new() -> Self {
-            let mut descs = Vec::new();
-
-            let systems_created = UIntGauge::new(
-                "pageserver_tokio_epoll_uring_systems_created",
-                "counter of tokio-epoll-uring systems that were created",
-            )
-            .unwrap();
-            descs.extend(
-                metrics::core::Collector::desc(&systems_created)
-                    .into_iter()
-                    .cloned(),
-            );
-
-            let systems_destroyed = UIntGauge::new(
-                "pageserver_tokio_epoll_uring_systems_destroyed",
-                "counter of tokio-epoll-uring systems that were destroyed",
-            )
-            .unwrap();
-            descs.extend(
-                metrics::core::Collector::desc(&systems_destroyed)
-                    .into_iter()
-                    .cloned(),
-            );
-
-            Self {
-                descs,
-                systems_created,
-                systems_destroyed,
-            }
-        }
-    }
-}
-
 pub fn preinitialize_metrics() {
    // Python tests need these and on some we do alerting.
    //
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -576,8 +576,8 @@ pub fn shutdown_token() -> CancellationToken {

 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
-        true_or_false
+    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
+        cancel.is_cancelled()
    } else {
        if !cfg!(test) {
            warn!("is_shutdown_requested() called in an unexpected task or thread");
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -53,7 +53,6 @@ use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
-use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
@@ -1377,7 +1376,7 @@ impl Tenant {
                async move {
                    debug!("starting index part download");

-                    let index_part = client.download_index_file(&cancel_clone).await;
+                    let index_part = client.download_index_file(cancel_clone).await;

                    debug!("finished index part download");

@@ -2398,67 +2397,6 @@ impl Tenant {
    pub(crate) fn get_generation(&self) -> Generation {
        self.generation
    }
-
-    /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
-    /// and can leave the tenant in a bad state if it fails.  The caller is responsible for
-    /// resetting this tenant to a valid state if we fail.
-    pub(crate) async fn split_prepare(
-        &self,
-        child_shards: &Vec<TenantShardId>,
-    ) -> anyhow::Result<()> {
-        let timelines = self.timelines.lock().unwrap().clone();
-        for timeline in timelines.values() {
-            let Some(tl_client) = &timeline.remote_client else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            let Some(remote_storage) = &self.remote_storage else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
-            // to ensure that they do not start a split if currently in the process of doing these.
-
-            // Upload an index from the parent: this is partly to provide freshness for the
-            // child tenants that will copy it, and partly for general ease-of-debugging: there will
-            // always be a parent shard index in the same generation as we wrote the child shard index.
-            tl_client.schedule_index_upload_for_file_changes()?;
-            tl_client.wait_completion().await?;
-
-            // Shut down the timeline's remote client: this means that the indices we write
-            // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await?;
-
-            // Download methods can still be used after shutdown, as they don't flow through the remote client's
-            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
-            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
-            // we use here really is the remotely persistent one).
-            let result = tl_client
-                .download_index_file(&self.cancel)
-                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
-                .await?;
-            let index_part = match result {
-                MaybeDeletedIndexPart::Deleted(_) => {
-                    anyhow::bail!("Timeline deletion happened concurrently with split")
-                }
-                MaybeDeletedIndexPart::IndexPart(p) => p,
-            };
-
-            for child_shard in child_shards {
-                upload_index_part(
-                    remote_storage,
-                    child_shard,
-                    &timeline.timeline_id,
-                    self.generation,
-                    &index_part,
-                    &self.cancel,
-                )
-                .await?;
-            }
-        }
-
-        Ok(())
-    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -3794,10 +3732,6 @@ impl Tenant {

        Ok(())
    }
-
-    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
-    }
 }

 fn remove_timeline_and_uninit_mark(
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,6 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
@@ -23,7 +22,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
-use utils::{completion, crashsafe};
+use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -645,6 +644,8 @@ pub(crate) async fn shutdown_all_tenants() {
 }

 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
+    use utils::completion;
+
    let mut join_set = JoinSet::new();

    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
@@ -1199,7 +1200,7 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        drop_cache: bool,
-        ctx: &RequestContext,
+        ctx: RequestContext,
    ) -> anyhow::Result<()> {
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let Some(old_slot) = slot_guard.get_old_value() else {
@@ -1252,7 +1253,7 @@ impl TenantManager {
            None,
            self.tenants,
            SpawnMode::Normal,
-            ctx,
+            &ctx,
        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1374,164 +1375,6 @@ impl TenantManager {
        slot_guard.revert();
        result
    }
-
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
-    pub(crate) async fn shard_split(
-        &self,
-        tenant_shard_id: TenantShardId,
-        new_shard_count: ShardCount,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<TenantShardId>> {
-        let tenant = get_tenant(tenant_shard_id, true)?;
-
-        // Plan: identify what the new child shards will be
-        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
-        if new_shard_count <= ShardCount(effective_old_shard_count) {
-            anyhow::bail!("Requested shard count is not an increase");
-        }
-        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
-        if !expansion_factor.is_power_of_two() {
-            anyhow::bail!("Requested split is not a power of two");
-        }
-
-        let parent_shard_identity = tenant.shard_identity;
-        let parent_tenant_conf = tenant.get_tenant_conf();
-        let parent_generation = tenant.generation;
-
-        let child_shards = tenant_shard_id.split(new_shard_count);
-        tracing::info!(
-            "Shard {} splits into: {}",
-            tenant_shard_id.to_index(),
-            child_shards
-                .iter()
-                .map(|id| format!("{}", id.to_index()))
-                .join(",")
-        );
-
-        // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
-        if let Err(e) = tenant.split_prepare(&child_shards).await {
-            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
-            // have been left in a partially-shut-down state.
-            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
-            self.reset_tenant(tenant_shard_id, false, ctx).await?;
-            return Err(e);
-        }
-
-        self.resources.deletion_queue_client.flush_advisory();
-
-        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
-        drop(tenant);
-        let mut parent_slot_guard =
-            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let parent = match parent_slot_guard.get_old_value() {
-            Some(TenantSlot::Attached(t)) => t,
-            Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
-            Some(TenantSlot::InProgress(_)) => {
-                // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
-                // it would return an error.
-                unreachable!()
-            }
-            None => {
-                // We don't actually need the parent shard to still be attached to do our work, but it's
-                // a weird enough situation that the caller probably didn't want us to continue working
-                // if they had detached the tenant they requested the split on.
-                anyhow::bail!("Detached parent shard in the middle of split!")
-            }
-        };
-
-        // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
-        // TODO: erase the dentries from the parent
-
-        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
-        // child shards to reach this point.
-        let mut target_lsns = HashMap::new();
-        for timeline in parent.timelines.lock().unwrap().clone().values() {
-            target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
-        }
-
-        // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
-        // and could slow down the children trying to catch up.
-
-        // Phase 3: Spawn the child shards
-        for child_shard in &child_shards {
-            let mut child_shard_identity = parent_shard_identity;
-            child_shard_identity.count = child_shard.shard_count;
-            child_shard_identity.number = child_shard.shard_number;
-
-            let child_location_conf = LocationConf {
-                mode: LocationMode::Attached(AttachedLocationConfig {
-                    generation: parent_generation,
-                    attach_mode: AttachmentMode::Single,
-                }),
-                shard: child_shard_identity,
-                tenant_conf: parent_tenant_conf,
-            };
-
-            self.upsert_location(
-                *child_shard,
-                child_location_conf,
-                None,
-                SpawnMode::Normal,
-                ctx,
-            )
-            .await?;
-        }
-
-        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
-        for child_shard_id in &child_shards {
-            let child_shard = {
-                let locked = TENANTS.read().unwrap();
-                let peek_slot =
-                    tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
-                peek_slot.and_then(|s| s.get_attached()).cloned()
-            };
-            if let Some(t) = child_shard {
-                let timelines = t.timelines.lock().unwrap().clone();
-                for timeline in timelines.values() {
-                    let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
-                        continue;
-                    };
-
-                    tracing::info!(
-                        "Waiting for child shard {}/{} to reach target lsn {}...",
-                        child_shard_id,
-                        timeline.timeline_id,
-                        target_lsn
-                    );
-                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
-                        // Failure here might mean shutdown, in any case this part is an optimization
-                        // and we shouldn't hold up the split operation.
-                        tracing::warn!(
-                            "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
-                            timeline.timeline_id
-                        );
-                    } else {
-                        tracing::info!(
-                            "Child shard {}/{} reached target lsn {}",
-                            child_shard_id,
-                            timeline.timeline_id,
-                            target_lsn
-                        );
-                    }
-                }
-            }
-        }
-
-        // Phase 5: Shut down the parent shard.
-        let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
-            Ok(()) => {}
-            Err(other) => {
-                other.wait().await;
-            }
-        }
-        parent_slot_guard.drop_old_value()?;
-
-        // Phase 6: Release the InProgress on the parent shard
-        drop(parent_slot_guard);
-
-        Ok(child_shards)
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2366,6 +2209,8 @@ async fn remove_tenant_from_memory<V, F>(
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
+    use utils::completion;
+
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -217,7 +217,6 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
@@ -263,11 +262,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -331,6 +325,11 @@ pub struct RemoteTimelineClient {
    cancel: CancellationToken,
 }

+/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
+/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
+const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+
 /// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
 ///
 /// This is a convenience for the various upload functions.  In future
@@ -507,7 +506,7 @@ impl RemoteTimelineClient {
    /// Download index file
    pub async fn download_index_file(
        &self,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
@@ -1148,17 +1147,22 @@ impl RemoteTimelineClient {

        let cancel = shutdown_token();

-        let remaining = download_retry(
+        let remaining = backoff::retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None)
+                    .list_files(Some(&timeline_storage_path))
                    .await
            },
-            "list remaining files",
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
            &cancel,
        )
        .await
-        .context("list files remaining files")?;
+        .ok_or_else(|| anyhow::anyhow!("Cancelled!"))
+        .and_then(|x| x)
+        .context("list prefixes")?;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -1347,7 +1351,6 @@ impl RemoteTimelineClient {
    /// queue.
    ///
    async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
-        let cancel = shutdown_token();
        // Loop to retry until it completes.
        loop {
            // If we're requested to shut down, close up shop and exit.
@@ -1359,7 +1362,7 @@ impl RemoteTimelineClient {
            // the Future, but we're not 100% sure if the remote storage library
            // is cancellation safe, so we don't dare to do that. Hopefully, the
            // upload finishes or times out soon enough.
-            if cancel.is_cancelled() {
+            if task_mgr::is_shutdown_requested() {
                info!("upload task cancelled by shutdown request");
                match self.stop() {
                    Ok(()) => {}
@@ -1470,7 +1473,7 @@ impl RemoteTimelineClient {
                        retries,
                        DEFAULT_BASE_BACKOFF_SECONDS,
                        DEFAULT_MAX_BACKOFF_SECONDS,
-                        &cancel,
+                        &shutdown_token(),
                    )
                    .await;
                }
@@ -1987,7 +1990,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2081,7 +2084,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let index_part = match client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2283,7 +2286,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -216,15 +216,16 @@ pub async fn list_remote_timelines(
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

+    let cancel_inner = cancel.clone();
    let listing = download_retry_forever(
        || {
            download_cancellable(
-                &cancel,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
+                &cancel_inner,
+                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
            )
        },
        &format!("list timelines for {tenant_shard_id}"),
-        &cancel,
+        cancel,
    )
    .await?;

@@ -257,18 +258,19 @@ async fn do_download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    use futures::stream::StreamExt;

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

+    let cancel_inner = cancel.clone();
    let index_part_bytes = download_retry_forever(
        || async {
            // Cancellation: if is safe to cancel this future because we're just downloading into
            // a memory buffer, not touching local disk.
            let index_part_download =
-                download_cancellable(cancel, storage.download(&remote_path)).await?;
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;

            let mut index_part_bytes = Vec::new();
            let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -286,7 +288,7 @@ async fn do_download_index_part(
    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
+        .with_context(|| format!("download index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

    Ok(index_part)
@@ -303,7 +305,7 @@ pub(super) async fn download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -323,8 +325,14 @@ pub(super) async fn download_index_part(
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_shard_id,
+        timeline_id,
+        my_generation,
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -349,7 +357,7 @@ pub(super) async fn download_index_part(
        tenant_shard_id,
        timeline_id,
        my_generation.previous(),
-        cancel,
+        cancel.clone(),
    )
    .await;
    match res {
@@ -371,13 +379,18 @@ pub(super) async fn download_index_part(
    // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
    // to constructing a full index path with no generation, because the generation is a suffix.
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
-
-    let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None).await },
-        "list index_part files",
-        cancel,
+    let indices = backoff::retry(
+        || async { storage.list_files(Some(&index_prefix)).await },
+        |_| false,
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "listing index_part files",
+        &cancel,
    )
-    .await?;
+    .await
+    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .and_then(|x| x)
+    .map_err(DownloadError::Other)?;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -434,6 +447,8 @@ pub(crate) async fn download_initdb_tar_zst(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

+    let cancel_inner = cancel.clone();
+
    let file = download_retry(
        || async {
            let file = OpenOptions::new()
@@ -446,11 +461,13 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = match download_cancellable(cancel, storage.download(&remote_path)).await
+            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
+                .await
            {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
-                    download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
+                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
+                        .await?
                }
                Err(other) => Err(other)?,
            };
@@ -499,7 +516,7 @@ pub(crate) async fn download_initdb_tar_zst(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-pub(super) async fn download_retry<T, O, F>(
+async fn download_retry<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
@@ -510,7 +527,7 @@ where
 {
    backoff::retry(
        op,
-        DownloadError::is_permanent,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
@@ -524,7 +541,7 @@ where
 async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
@@ -532,11 +549,11 @@ where
 {
    backoff::retry(
        op,
-        DownloadError::is_permanent,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
-        cancel,
+        &cancel,
    )
    .await
    .ok_or_else(|| DownloadError::Cancelled)
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
 use tracing::info;

 /// Serializes and uploads the given index part data to the remote storage.
-pub(crate) async fn upload_index_part<'a>(
+pub(super) async fn upload_index_part<'a>(
    storage: &'a GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -160,7 +160,7 @@ impl SecondaryTenant {
        &self.tenant_shard_id
    }

-    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
+    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
        self.detail.lock().unwrap().get_layers_for_eviction(self)
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -146,15 +146,14 @@ impl SecondaryDetail {
        }
    }

-    /// Additionally returns the total number of layers, used for more stable relative access time
-    /// based eviction.
    pub(super) fn get_layers_for_eviction(
        &self,
        parent: &Arc<SecondaryTenant>,
-    ) -> (DiskUsageEvictionInfo, usize) {
-        let mut result = DiskUsageEvictionInfo::default();
-        let mut total_layers = 0;
-
+    ) -> DiskUsageEvictionInfo {
+        let mut result = DiskUsageEvictionInfo {
+            max_layer_size: None,
+            resident_layers: Vec::new(),
+        };
        for (timeline_id, timeline_detail) in &self.timelines {
            result
                .resident_layers
@@ -170,10 +169,6 @@ impl SecondaryDetail {
                        relative_last_activity: finite_f32::FiniteF32::ZERO,
                    }
                }));
-
-            // total might be missing currently downloading layers, but as a lower than actual
-            // value it is good enough approximation.
-            total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
        }
        result.max_layer_size = result
            .resident_layers
@@ -188,7 +183,7 @@ impl SecondaryDetail {
            result.resident_layers.len()
        );

-        (result, total_layers)
+        result
    }
 }

@@ -317,7 +312,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            .tenant_manager
            .get_secondary_tenant_shard(*tenant_shard_id);
        let Some(tenant) = tenant else {
-            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            {
+                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            }
        };

        Ok(PendingDownload {
@@ -392,9 +389,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            }

            CompleteDownload {
-                secondary_state,
-                completed_at: Instant::now(),
-            }
+                    secondary_state,
+                    completed_at: Instant::now(),
+                }
        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }
 }
@@ -533,7 +530,7 @@ impl<'a> TenantDownloader<'a> {
                    .map_err(UpdateError::from)?;
                let mut heatmap_bytes = Vec::new();
                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
+                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
                Ok(heatmap_bytes)
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -300,8 +300,8 @@ impl Layer {
        })
    }

-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.0.info(reset)
+    pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.0.info(reset).await
    }

    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -612,10 +612,10 @@ impl LayerInner {
        let mut rx = self.status.subscribe();

        let strong = {
-            match self.inner.get() {
+            match self.inner.get_mut().await {
                Some(mut either) => {
                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
+                    ResidentOrWantedEvicted::downgrade(&mut either)
                }
                None => return Err(EvictionError::NotFound),
            }
@@ -641,7 +641,7 @@ impl LayerInner {
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
+                match self.inner.get_mut().await {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -759,7 +759,7 @@ impl LayerInner {
                // use the already held initialization permit because it is impossible to hit the
                // below paths anymore essentially limiting the max loop iterations to 2.
                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
+                let mut guard = self.inner.set(value, init_permit).await;
                let (strong, _upgraded) = guard
                    .get_and_upgrade()
                    .expect("init creates strong reference, we held the init permit");
@@ -767,7 +767,7 @@ impl LayerInner {
            }

            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
+                let mut locked = self.inner.get_mut_or_init(download).await?;

                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                    if upgraded {
@@ -989,12 +989,12 @@ impl LayerInner {
        }
    }

-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+    async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

        // this is not accurate: we could have the file locally but there was a cancellation
        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let remote = self.inner.get_mut().await.is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -1053,7 +1053,7 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
-                match this.evict_blocking(version) {
+                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                }
@@ -1061,7 +1061,7 @@ impl LayerInner {
        }
    }

-    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
+    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
@@ -1070,7 +1070,7 @@ impl LayerInner {
        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
-            let maybe_downloaded = self.inner.get();
+            let maybe_downloaded = self.inner.get_mut().await;

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1268,7 +1268,7 @@ impl Timeline {
        let mut historic_layers = Vec::new();
        for historic_layer in layer_map.iter_historic_layers() {
            let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset));
+            historic_layers.push(historic_layer.info(reset).await);
        }

        LayerMapInfo {
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -314,9 +314,6 @@ lfc_change_limit_hook(int newval, void *extra)
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
-	if (new_size == 0) {
-		lfc_ctl->generation += 1;
-	}
 	neon_log(DEBUG1, "set local file cache limit to %d", new_size);

 	LWLockRelease(lfc_lock);
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -11,23 +11,16 @@
 #include "postgres.h"
 #include "fmgr.h"

-#include "miscadmin.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "catalog/pg_type.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
 #include "replication/walsender.h"
-#include "storage/procsignal.h"
-#include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
-#include "utils/wait_event.h"

 #include "neon.h"
 #include "walproposer.h"
@@ -37,130 +30,6 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);

-static int	logical_replication_max_time_lag = 3600;
-
-static void
-InitLogicalReplicationMonitor(void)
-{
-	BackgroundWorker bgw;
-
-	DefineCustomIntVariable(
-		"neon.logical_replication_max_time_lag",
-		"Threshold for dropping unused logical replication slots",
-		NULL,
-		&logical_replication_max_time_lag,
-		3600, 0, INT_MAX,
-		PGC_SIGHUP,
-		GUC_UNIT_S,
-		NULL, NULL, NULL);
-
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
-typedef struct
-{
-	NameData    name;
-	bool        dropped;
-	XLogRecPtr  confirmed_flush_lsn;
-	TimestampTz last_updated;
-} SlotStatus;
-
-/*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
- */
-PGDLLEXPORT void
-LogicalSlotsMonitorMain(Datum main_arg)
-{
-	SlotStatus* slots;
-	TimestampTz now, last_checked;
-
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, die);
-
-	BackgroundWorkerUnblockSignals();
-
-	slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
-	last_checked = GetCurrentTimestamp();
-
-	for (;;)
-	{
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 logical_replication_max_time_lag*1000/2,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
-
-		now = GetCurrentTimestamp();
-
-		if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
-		{
-			int n_active_slots = 0;
-			last_checked = now;
-
-			LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
-			for (int i = 0; i < max_replication_slots; i++)
-			{
-				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-
-				/* Consider only logical repliction slots */
-				if (!s->in_use || !SlotIsLogical(s))
-					continue;
-
-				if (s->active_pid != 0)
-				{
-					n_active_slots += 1;
-					continue;
-				}
-
-				/* Check if there was some activity with the slot since last check */
-				if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
-				{
-					slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
-					slots[i].last_updated = now;
-				}
-				else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
-				{
-					slots[i].name = s->data.name;
-					slots[i].dropped = true;
-				}
-			}
-			LWLockRelease(ReplicationSlotControlLock);
-
-			/*
-			 * If there are no active subscriptions, then no new snapshots are generated
-			 * and so no need to force slot deletion.
-			 */
-			if (n_active_slots != 0)
-			{
-				for (int i = 0; i < max_replication_slots; i++)
-				{
-					if (slots[i].dropped)
-					{
-						elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
-							 (now - slots[i].last_updated)/USECS_PER_SEC);
-						ReplicationSlotDrop(slots[i].name.data, true);
-						slots[i].dropped = false;
-					}
-				}
-			}
-		}
-	}
-}
-
 void
 _PG_init(void)
 {
@@ -175,8 +44,6 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();

-	InitLogicalReplicationMonitor();
-
 	InitControlPlaneConnector();

 	pg_init_extension_server();
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -19,7 +19,6 @@ chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
-env_logger.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -60,8 +59,6 @@ scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sha2.workspace = true
-smol_str.workspace = true
-smallvec.workspace = true
 socket2.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
@@ -78,7 +75,6 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
 url.workspace = true
-urlencoding.workspace = true
 utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
@@ -87,6 +83,7 @@ native-tls.workspace = true
 postgres-native-tls.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true
+smol_str.workspace = true

 workspace_hack.workspace = true

--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -5,8 +5,7 @@ pub use backend::BackendType;

 mod credentials;
 pub use credentials::{
-    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
-    ComputeUserInfoParseError, IpPattern,
+    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
 };

 mod password_hack;
@@ -15,12 +14,8 @@ use password_hack::PasswordHackPayload;

 mod flow;
 pub use flow::*;
-use tokio::time::error::Elapsed;

-use crate::{
-    console,
-    error::{ReportableError, UserFacingError},
-};
+use crate::{console, error::UserFacingError};
 use std::io;
 use thiserror::Error;

@@ -72,9 +67,6 @@ pub enum AuthErrorImpl {

    #[error("Too many connections to this endpoint. Please try again later.")]
    TooManyConnections,
-
-    #[error("Authentication timed out")]
-    UserTimeout(Elapsed),
 }

 #[derive(Debug, Error)]
@@ -101,10 +93,6 @@ impl AuthError {
    pub fn is_auth_failed(&self) -> bool {
        matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
    }
-
-    pub fn user_timeout(elapsed: Elapsed) -> Self {
-        AuthErrorImpl::UserTimeout(elapsed).into()
-    }
 }

 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -128,27 +116,6 @@ impl UserFacingError for AuthError {
            Io(_) => "Internal error".to_string(),
            IpAddressNotAllowed => self.to_string(),
            TooManyConnections => self.to_string(),
-            UserTimeout(_) => self.to_string(),
-        }
-    }
-}
-
-impl ReportableError for AuthError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        use AuthErrorImpl::*;
-        match self.0.as_ref() {
-            Link(e) => e.get_error_kind(),
-            GetAuthInfo(e) => e.get_error_kind(),
-            WakeCompute(e) => e.get_error_kind(),
-            Sasl(e) => e.get_error_kind(),
-            AuthFailed(_) => crate::error::ErrorKind::User,
-            BadAuthMethod(_) => crate::error::ErrorKind::User,
-            MalformedPassword(_) => crate::error::ErrorKind::User,
-            MissingEndpointName => crate::error::ErrorKind::User,
-            Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed => crate::error::ErrorKind::User,
-            TooManyConnections => crate::error::ErrorKind::RateLimit,
-            UserTimeout(_) => crate::error::ErrorKind::User,
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -68,7 +68,6 @@ pub trait TestBackend: Send + Sync + 'static {
    fn get_allowed_ips_and_secret(
        &self,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
-    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -359,17 +358,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 }

 impl BackendType<'_, ComputeUserInfo> {
-    pub async fn get_role_secret(
-        &self,
-        ctx: &mut RequestMonitoring,
-    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        use BackendType::*;
-        match self {
-            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Link(_) => Ok(Cached::new_uncached(None)),
-        }
-    }
-
    pub async fn get_allowed_ips_and_secret(
        &self,
        ctx: &mut RequestMonitoring,
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -45,9 +45,9 @@ pub(super) async fn authenticate(
                }
            )
            .await
-            .map_err(|e| {
+            .map_err(|error| {
                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
-                auth::AuthError::user_timeout(e)
+                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
            })??;

            let client_key = match auth_outcome {
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -2,7 +2,7 @@ use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
    context::RequestMonitoring,
-    error::{ReportableError, UserFacingError},
+    error::UserFacingError,
    stream::PqStream,
    waiters,
 };
@@ -14,6 +14,10 @@ use tracing::{info, info_span};

 #[derive(Debug, Error)]
 pub enum LinkAuthError {
+    /// Authentication error reported by the console.
+    #[error("Authentication failed: {0}")]
+    AuthFailed(String),
+
    #[error(transparent)]
    WaiterRegister(#[from] waiters::RegisterError),

@@ -26,16 +30,10 @@ pub enum LinkAuthError {

 impl UserFacingError for LinkAuthError {
    fn to_string_client(&self) -> String {
-        "Internal error".to_string()
-    }
-}
-
-impl ReportableError for LinkAuthError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        use LinkAuthError::*;
        match self {
-            LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
-            LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
-            LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            AuthFailed(_) => self.to_string(),
+            _ => "Internal error".to_string(),
        }
    }
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,12 +1,8 @@
 //! User credentials used in authentication.

 use crate::{
-    auth::password_hack::parse_endpoint_param,
-    context::RequestMonitoring,
-    error::{ReportableError, UserFacingError},
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
-    proxy::NeonOptions,
-    serverless::SERVERLESS_DRIVER_SNI,
+    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
    EndpointId, RoleName,
 };
 use itertools::Itertools;
@@ -43,12 +39,6 @@ pub enum ComputeUserInfoParseError {

 impl UserFacingError for ComputeUserInfoParseError {}

-impl ReportableError for ComputeUserInfoParseError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        crate::error::ErrorKind::User
-    }
-}
-
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -167,7 +167,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
    }
 }

-pub(crate) fn validate_password_and_exchange(
+pub(super) fn validate_password_and_exchange(
    password: &[u8],
    secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -240,9 +240,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
                ?unexpected,
                "unexpected startup packet, rejecting connection"
            );
-            stream
-                .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User)
-                .await?
+            stream.throw_error_str(ERR_INSECURE_CONNECTION).await?
        }
    }
 }
@@ -274,10 +272,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-
-    // doesn't yet matter as pg-sni-router doesn't report analytics logs
-    ctx.set_success();
-    ctx.log();
-
-    proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
+    proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -88,12 +88,6 @@ struct ProxyCliArgs {
    /// path to directory with TLS certificates for client postgres connections
    #[clap(long)]
    certs_dir: Option<String>,
-    /// timeout for the TLS handshake
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    handshake_timeout: tokio::time::Duration,
-    /// timeout for the control plane requests
-    #[clap(long, default_value = "70s", value_parser = humantime::parse_duration)]
-    cplane_timeout: tokio::time::Duration,
    /// http endpoint to receive periodic metric updates
    #[clap(long)]
    metric_collection_endpoint: Option<String>,
@@ -171,10 +165,6 @@ struct SqlOverHttpArgs {
    #[clap(long, default_value_t = 20)]
    sql_over_http_pool_max_conns_per_endpoint: usize,

-    /// How many connections to pool for each endpoint. Excess connections are discarded
-    #[clap(long, default_value_t = 20000)]
-    sql_over_http_pool_max_total_conns: usize,
-
    /// How long pooled connections should remain idle for before closing
    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
    sql_over_http_idle_timeout: tokio::time::Duration,
@@ -371,10 +361,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            tokio::spawn(locks.garbage_collect_worker(epoch));

            let url = args.auth_endpoint.parse()?;
-            let endpoint = http::Endpoint::new(
-                url,
-                http::new_client(rate_limiter_config, args.cplane_timeout),
-            );
+            let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));

            let api = console::provider::neon::Api::new(endpoint, caches, locks);
            let api = console::provider::ConsoleBackend::Console(api);
@@ -400,7 +387,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            pool_shards: args.sql_over_http.sql_over_http_pool_shards,
            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
            opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
-            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
        },
    };
    let authentication_config = AuthenticationConfig {
@@ -420,7 +406,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        require_client_ip: args.require_client_ip,
        disable_ip_check_for_http: args.disable_ip_check_for_http,
        endpoint_rps_limit,
-        handshake_timeout: args.handshake_timeout,
        // TODO: add this argument
        region: args.region.clone(),
    }));
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,45 +1,24 @@
+use anyhow::Context;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::{net::SocketAddr, sync::Arc};
-use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;

-use crate::error::ReportableError;
-
 /// Enables serving `CancelRequest`s.
 #[derive(Default)]
 pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);

-#[derive(Debug, Error)]
-pub enum CancelError {
-    #[error("{0}")]
-    IO(#[from] std::io::Error),
-    #[error("{0}")]
-    Postgres(#[from] tokio_postgres::Error),
-}
-
-impl ReportableError for CancelError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        match self {
-            CancelError::IO(_) => crate::error::ErrorKind::Compute,
-            CancelError::Postgres(e) if e.as_db_error().is_some() => {
-                crate::error::ErrorKind::Postgres
-            }
-            CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
-        }
-    }
-}
-
 impl CancelMap {
    /// Cancel a running query for the corresponding connection.
-    pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> {
+    pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> {
        // NB: we should immediately release the lock after cloning the token.
-        let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else {
-            tracing::warn!("query cancellation key not found: {key}");
-            return Ok(());
-        };
+        let cancel_closure = self
+            .0
+            .get(&key)
+            .and_then(|x| x.clone())
+            .with_context(|| format!("query cancellation key not found: {key}"))?;

        info!("cancelling query per user's request using key {key}");
        cancel_closure.try_cancel_query().await
@@ -102,7 +81,7 @@ impl CancelClosure {
    }

    /// Cancels the query running on user's compute node.
-    async fn try_cancel_query(self) -> Result<(), CancelError> {
+    pub async fn try_cancel_query(self) -> anyhow::Result<()> {
        let socket = TcpStream::connect(self.socket_addr).await?;
        self.cancel_token.cancel_query_raw(socket, NoTls).await?;

--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,10 +1,6 @@
 use crate::{
-    auth::parse_endpoint_param,
-    cancellation::CancelClosure,
-    console::errors::WakeComputeError,
-    context::RequestMonitoring,
-    error::{ReportableError, UserFacingError},
-    metrics::NUM_DB_CONNECTIONS_GAUGE,
+    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
+    context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
    proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
@@ -62,20 +58,6 @@ impl UserFacingError for ConnectionError {
    }
 }

-impl ReportableError for ConnectionError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        match self {
-            ConnectionError::Postgres(e) if e.as_db_error().is_some() => {
-                crate::error::ErrorKind::Postgres
-            }
-            ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
-            ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
-            ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
-            ConnectionError::WakeComputeError(e) => e.get_error_kind(),
-        }
-    }
-}
-
 /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
 pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;

--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -22,7 +22,6 @@ pub struct ProxyConfig {
    pub disable_ip_check_for_http: bool,
    pub endpoint_rps_limit: Vec<RateBucketInfo>,
    pub region: String,
-    pub handshake_timeout: Duration,
 }

 #[derive(Debug)]
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -20,7 +20,7 @@ use tracing::info;

 pub mod errors {
    use crate::{
-        error::{io_error, ReportableError, UserFacingError},
+        error::{io_error, UserFacingError},
        http,
        proxy::retry::ShouldRetry,
    };
@@ -81,15 +81,6 @@ pub mod errors {
        }
    }

-    impl ReportableError for ApiError {
-        fn get_error_kind(&self) -> crate::error::ErrorKind {
-            match self {
-                ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
-                ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
-            }
-        }
-    }
-
    impl ShouldRetry for ApiError {
        fn could_retry(&self) -> bool {
            match self {
@@ -159,16 +150,6 @@ pub mod errors {
            }
        }
    }
-
-    impl ReportableError for GetAuthInfoError {
-        fn get_error_kind(&self) -> crate::error::ErrorKind {
-            match self {
-                GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
-                GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
-            }
-        }
-    }
-
    #[derive(Debug, Error)]
    pub enum WakeComputeError {
        #[error("Console responded with a malformed compute address: {0}")]
@@ -213,16 +194,6 @@ pub mod errors {
            }
        }
    }
-
-    impl ReportableError for WakeComputeError {
-        fn get_error_kind(&self) -> crate::error::ErrorKind {
-            match self {
-                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
-                WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
-            }
-        }
-    }
 }

 /// Auth secret which is managed by the cloud.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -188,7 +188,6 @@ impl super::Api for Api {
                ep,
                Arc::new(auth_info.allowed_ips),
            );
-            ctx.set_project_id(project_id);
        }
        // When we just got a secret, we don't need to invalidate it.
        Ok(Cached::new_uncached(auth_info.secret))
@@ -222,7 +221,6 @@ impl super::Api for Api {
            self.caches
                .project_info
                .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
-            ctx.set_project_id(project_id);
        }
        Ok((
            Cached::new_uncached(allowed_ips),
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -8,10 +8,8 @@ use tokio::sync::mpsc;
 use uuid::Uuid;

 use crate::{
-    console::messages::MetricsAuxInfo,
-    error::ErrorKind,
-    metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
-    BranchId, EndpointId, ProjectId, RoleName,
+    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
+    EndpointId, ProjectId, RoleName,
 };

 pub mod parquet;
@@ -91,10 +89,6 @@ impl RequestMonitoring {
        self.project = Some(x.project_id);
    }

-    pub fn set_project_id(&mut self, project_id: ProjectId) {
-        self.project = Some(project_id);
-    }
-
    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
        crate::metrics::CONNECTING_ENDPOINTS
            .with_label_values(&[self.protocol])
@@ -110,18 +104,6 @@ impl RequestMonitoring {
        self.user = Some(user);
    }

-    pub fn set_error_kind(&mut self, kind: ErrorKind) {
-        ERROR_BY_KIND
-            .with_label_values(&[kind.to_metric_label()])
-            .inc();
-        if let Some(ep) = &self.endpoint_id {
-            ENDPOINT_ERRORS_BY_KIND
-                .with_label_values(&[kind.to_metric_label()])
-                .measure(ep);
-        }
-        self.error_kind = Some(kind);
-    }
-
    pub fn set_success(&mut self) {
        self.success = true;
    }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -108,7 +108,7 @@ impl From<RequestMonitoring> for RequestData {
            branch: value.branch.as_deref().map(String::from),
            protocol: value.protocol,
            region: value.region,
-            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
+            error: value.error_kind.as_ref().map(|e| e.to_str()),
            success: value.success,
            duration_us: SystemTime::from(value.first_packet)
                .elapsed()
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -17,7 +17,7 @@ pub fn log_error<E: fmt::Display>(e: E) -> E {
 /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
 /// is way too convenient and tends to proliferate all across the codebase,
 /// ultimately leading to accidental leaks of sensitive data.
-pub trait UserFacingError: ReportableError {
+pub trait UserFacingError: fmt::Display {
    /// Format the error for client, stripping all sensitive info.
    ///
    /// Although this might be a no-op for many types, it's highly
@@ -29,13 +29,13 @@ pub trait UserFacingError: ReportableError {
    }
 }

-#[derive(Copy, Clone, Debug)]
+#[derive(Clone)]
 pub enum ErrorKind {
    /// Wrong password, unknown endpoint, protocol violation, etc...
    User,

    /// Network error between user and proxy. Not necessarily user error
-    ClientDisconnect,
+    Disconnect,

    /// Proxy self-imposed rate limits
    RateLimit,
@@ -46,9 +46,6 @@ pub enum ErrorKind {
    /// Error communicating with control plane
    ControlPlane,

-    /// Postgres error
-    Postgres,
-
    /// Error communicating with compute
    Compute,
 }
@@ -57,36 +54,11 @@ impl ErrorKind {
    pub fn to_str(&self) -> &'static str {
        match self {
            ErrorKind::User => "request failed due to user error",
-            ErrorKind::ClientDisconnect => "client disconnected",
+            ErrorKind::Disconnect => "client disconnected",
            ErrorKind::RateLimit => "request cancelled due to rate limit",
            ErrorKind::Service => "internal service error",
            ErrorKind::ControlPlane => "non-retryable control plane error",
-            ErrorKind::Postgres => "postgres error",
-            ErrorKind::Compute => {
-                "non-retryable compute connection error (or exhausted retry capacity)"
-            }
-        }
-    }
-
-    pub fn to_metric_label(&self) -> &'static str {
-        match self {
-            ErrorKind::User => "user",
-            ErrorKind::ClientDisconnect => "clientdisconnect",
-            ErrorKind::RateLimit => "ratelimit",
-            ErrorKind::Service => "service",
-            ErrorKind::ControlPlane => "controlplane",
-            ErrorKind::Postgres => "postgres",
-            ErrorKind::Compute => "compute",
+            ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)",
        }
    }
 }
-
-pub trait ReportableError: fmt::Display + Send + 'static {
-    fn get_error_kind(&self) -> ErrorKind;
-}
-
-impl ReportableError for tokio::time::error::Elapsed {
-    fn get_error_kind(&self) -> ErrorKind {
-        ErrorKind::RateLimit
-    }
-}
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -19,14 +19,10 @@ use reqwest_middleware::RequestBuilder;
 /// This is the preferred way to create new http clients,
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
-pub fn new_client(
-    rate_limiter_config: rate_limiter::RateLimiterConfig,
-    timeout: Duration,
-) -> ClientWithMiddleware {
+pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
    let client = reqwest::ClientBuilder::new()
        .dns_resolver(Arc::new(GaiResolver::default()))
        .connection_verbose(true)
-        .timeout(timeout)
        .build()
        .expect("Failed to create http client");

--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,10 +1,8 @@
 use ::metrics::{
    exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
-    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
-    register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
-    IntCounterVec, IntGauge, IntGaugeVec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
+    HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
-use metrics::{register_int_counter_pair, IntCounterPair};

 use once_cell::sync::Lazy;
 use tokio::time;
@@ -114,44 +112,6 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
    .unwrap()
 });

-pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_http_conn_content_length_bytes",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(8.0, 2.0, 20).unwrap()
-    )
-    .unwrap()
-});
-
-pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_http_pool_reclaimation_lag_seconds",
-        "Time it takes to reclaim unused connection pools",
-        // 1us -> 65ms
-        exponential_buckets(1e-6, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
-    register_int_counter_pair!(
-        "proxy_http_pool_endpoints_registered_total",
-        "Number of endpoints we have registered pools for",
-        "proxy_http_pool_endpoints_unregistered_total",
-        "Number of endpoints we have unregistered pools for",
-    )
-    .unwrap()
-});
-
-pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "proxy_http_pool_opened_connections",
-        "Number of opened connections to a database.",
-    )
-    .unwrap()
-});
-
 #[derive(Clone)]
 pub struct LatencyTimer {
    // time since the stopwatch was started
@@ -274,22 +234,3 @@ pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
    )
    .unwrap()
 });
-
-pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_errors_total",
-        "Number of errors by a given classification",
-        &["type"],
-    )
-    .unwrap()
-});
-
-pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
-    register_hll_vec!(
-        32,
-        "proxy_endpoints_affected_by_errors",
-        "Number of endpoints affected by errors of a given classification",
-        &["type"],
-    )
-    .unwrap()
-});
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -13,10 +13,9 @@ use crate::{
    compute,
    config::{ProxyConfig, TlsConfig},
    context::RequestMonitoring,
-    error::ReportableError,
    metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
    protocol2::WithClientIp,
-    proxy::handshake::{handshake, HandshakeData},
+    proxy::{handshake::handshake, passthrough::proxy_pass},
    rate_limiter::EndpointRateLimiter,
    stream::{PqStream, Stream},
    EndpointCacheKey,
@@ -29,17 +28,14 @@ use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
 use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
-use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};

-use self::{
-    connect_compute::{connect_to_compute, TcpMechanism},
-    passthrough::ProxyPassthrough,
-};
+use self::connect_compute::{connect_to_compute, TcpMechanism};

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
+const ERR_PROTO_VIOLATION: &str = "protocol violation";

 pub async fn run_until_cancelled<F: std::future::Future>(
    f: F,
@@ -102,14 +98,14 @@ pub async fn task_main(
                    bail!("missing required client IP");
                }

+                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+
                socket
                    .inner
                    .set_nodelay(true)
                    .context("failed to set socket option")?;

-                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
-
-                let res = handle_client(
+                handle_client(
                    config,
                    &mut ctx,
                    cancel_map,
@@ -117,26 +113,7 @@ pub async fn task_main(
                    ClientMode::Tcp,
                    endpoint_rate_limiter,
                )
-                .await;
-
-                match res {
-                    Err(e) => {
-                        // todo: log and push to ctx the error kind
-                        ctx.set_error_kind(e.get_error_kind());
-                        ctx.log();
-                        Err(e.into())
-                    }
-                    Ok(None) => {
-                        ctx.set_success();
-                        ctx.log();
-                        Ok(())
-                    }
-                    Ok(Some(p)) => {
-                        ctx.set_success();
-                        ctx.log();
-                        p.proxy_pass().await
-                    }
-                }
+                .await
            }
            .unwrap_or_else(move |e| {
                // Acknowledge that the task has finished with an error.
@@ -192,37 +169,6 @@ impl ClientMode {
    }
 }

-#[derive(Debug, Error)]
-// almost all errors should be reported to the user, but there's a few cases where we cannot
-// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons
-// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
-//    we cannot be sure the client even understands our error message
-// 3. PrepareClient: The client disconnected, so we can't tell them anyway...
-pub enum ClientRequestError {
-    #[error("{0}")]
-    Cancellation(#[from] cancellation::CancelError),
-    #[error("{0}")]
-    Handshake(#[from] handshake::HandshakeError),
-    #[error("{0}")]
-    HandshakeTimeout(#[from] tokio::time::error::Elapsed),
-    #[error("{0}")]
-    PrepareClient(#[from] std::io::Error),
-    #[error("{0}")]
-    ReportedError(#[from] crate::stream::ReportedError),
-}
-
-impl ReportableError for ClientRequestError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        match self {
-            ClientRequestError::Cancellation(e) => e.get_error_kind(),
-            ClientRequestError::Handshake(e) => e.get_error_kind(),
-            ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit,
-            ClientRequestError::ReportedError(e) => e.get_error_kind(),
-            ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect,
-        }
-    }
-}
-
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
@@ -230,7 +176,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    stream: S,
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
+) -> anyhow::Result<()> {
    info!(
        protocol = ctx.protocol,
        "handling interactive connection from client"
@@ -247,17 +193,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let tls = config.tls_config.as_ref();

    let pause = ctx.latency_timer.pause();
-    let do_handshake = handshake(stream, mode.handshake_tls(tls));
-    let (mut stream, params) =
-        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
-            HandshakeData::Startup(stream, params) => (stream, params),
-            HandshakeData::Cancel(cancel_key_data) => {
-                return Ok(cancel_map
-                    .cancel_session(cancel_key_data)
-                    .await
-                    .map(|()| None)?)
-            }
-        };
+    let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
+    let (mut stream, params) = match do_handshake.await? {
+        Some(x) => x,
+        None => return Ok(()), // it's a cancellation request
+    };
    drop(pause);

    let hostname = mode.hostname(stream.get_ref());
@@ -281,7 +221,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        if !endpoint_rate_limiter.check(ep) {
            return stream
                .throw_error(auth::AuthError::too_many_connections())
-                .await?;
+                .await;
        }
    }

@@ -301,7 +241,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
            let app = params.get("application_name");
            let params_span = tracing::info_span!("", ?user, ?db, ?app);

-            return stream.throw_error(e).instrument(params_span).await?;
+            return stream.throw_error(e).instrument(params_span).await;
        }
    };

@@ -327,13 +267,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let (stream, read_buf) = stream.into_inner();
    node.stream.write_all(&read_buf).await?;

-    Ok(Some(ProxyPassthrough {
-        client: stream,
-        compute: node,
-        aux,
-        req: _request_gauge,
-        conn: _client_gauge,
-    }))
+    proxy_pass(ctx, stream, node.stream, aux).await
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
@@ -342,7 +276,7 @@ async fn prepare_client_connection(
    node: &compute::PostgresConnection,
    session: &cancellation::Session,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> Result<(), std::io::Error> {
+) -> anyhow::Result<()> {
    // Register compute's query cancellation token and produce a new, unique one.
    // The new token (cancel_key_data) will be sent to the client.
    let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -34,6 +34,21 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
    node_info.invalidate().config
 }

+/// Try to connect to the compute node once.
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
+async fn connect_to_compute_once(
+    ctx: &mut RequestMonitoring,
+    node_info: &console::CachedNodeInfo,
+    timeout: time::Duration,
+) -> Result<PostgresConnection, compute::ConnectionError> {
+    let allow_self_signed_compute = node_info.allow_self_signed_compute;
+
+    node_info
+        .config
+        .connect(ctx, allow_self_signed_compute, timeout)
+        .await
+}
+
 #[async_trait]
 pub trait ConnectMechanism {
    type Connection;
@@ -60,18 +75,13 @@ impl ConnectMechanism for TcpMechanism<'_> {
    type ConnectError = compute::ConnectionError;
    type Error = compute::ConnectionError;

-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
    async fn connect_once(
        &self,
        ctx: &mut RequestMonitoring,
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<PostgresConnection, Self::Error> {
-        let allow_self_signed_compute = node_info.allow_self_signed_compute;
-        node_info
-            .config
-            .connect(ctx, allow_self_signed_compute, timeout)
-            .await
+        connect_to_compute_once(ctx, node_info, timeout).await
    }

    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -1,60 +1,15 @@
-use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams};
-use thiserror::Error;
+use anyhow::{bail, Context};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;

 use crate::{
+    cancellation::CancelMap,
    config::TlsConfig,
-    error::ReportableError,
-    proxy::ERR_INSECURE_CONNECTION,
-    stream::{PqStream, Stream, StreamUpgradeError},
+    proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
+    stream::{PqStream, Stream},
 };

-#[derive(Error, Debug)]
-pub enum HandshakeError {
-    #[error("data is sent before server replied with EncryptionResponse")]
-    EarlyData,
-
-    #[error("protocol violation")]
-    ProtocolViolation,
-
-    #[error("missing certificate")]
-    MissingCertificate,
-
-    #[error("{0}")]
-    StreamUpgradeError(#[from] StreamUpgradeError),
-
-    #[error("{0}")]
-    Io(#[from] std::io::Error),
-
-    #[error("{0}")]
-    ReportedError(#[from] crate::stream::ReportedError),
-}
-
-impl ReportableError for HandshakeError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        match self {
-            HandshakeError::EarlyData => crate::error::ErrorKind::User,
-            HandshakeError::ProtocolViolation => crate::error::ErrorKind::User,
-            // This error should not happen, but will if we have no default certificate and
-            // the client sends no SNI extension.
-            // If they provide SNI then we can be sure there is a certificate that matches.
-            HandshakeError::MissingCertificate => crate::error::ErrorKind::Service,
-            HandshakeError::StreamUpgradeError(upgrade) => match upgrade {
-                StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service,
-                StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            },
-            HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            HandshakeError::ReportedError(e) => e.get_error_kind(),
-        }
-    }
-}
-
-pub enum HandshakeData<S> {
-    Startup(PqStream<Stream<S>>, StartupMessageParams),
-    Cancel(CancelKeyData),
-}
-
 /// Establish a (most probably, secure) connection with the client.
 /// For better testing experience, `stream` can be any object satisfying the traits.
 /// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
@@ -63,7 +18,8 @@ pub enum HandshakeData<S> {
 pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    stream: S,
    mut tls: Option<&TlsConfig>,
-) -> Result<HandshakeData<S>, HandshakeError> {
+    cancel_map: &CancelMap,
+) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
    // Client may try upgrading to each protocol only once
    let (mut tried_ssl, mut tried_gss) = (false, false);

@@ -93,14 +49,14 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                        // pipelining in our node js driver. We should probably
                        // support that by chaining read_buf with the stream.
                        if !read_buf.is_empty() {
-                            return Err(HandshakeError::EarlyData);
+                            bail!("data is sent before server replied with EncryptionResponse");
                        }
                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;

                        let (_, tls_server_end_point) = tls
                            .cert_resolver
                            .resolve(tls_stream.get_ref().1.server_name())
-                            .ok_or(HandshakeError::MissingCertificate)?;
+                            .context("missing certificate")?;

                        stream = PqStream::new(Stream::Tls {
                            tls: Box::new(tls_stream),
@@ -108,7 +64,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                        });
                    }
                }
-                _ => return Err(HandshakeError::ProtocolViolation),
+                _ => bail!(ERR_PROTO_VIOLATION),
            },
            GssEncRequest => match stream.get_ref() {
                Stream::Raw { .. } if !tried_gss => {
@@ -117,23 +73,23 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                    // Currently, we don't support GSSAPI
                    stream.write_message(&Be::EncryptionResponse(false)).await?;
                }
-                _ => return Err(HandshakeError::ProtocolViolation),
+                _ => bail!(ERR_PROTO_VIOLATION),
            },
            StartupMessage { params, .. } => {
                // Check that the config has been consumed during upgrade
                // OR we didn't provide it at all (for dev purposes).
                if tls.is_some() {
-                    return stream
-                        .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User)
-                        .await?;
+                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
                }

                info!(session_type = "normal", "successful handshake");
-                break Ok(HandshakeData::Startup(stream, params));
+                break Ok(Some((stream, params)));
            }
            CancelRequest(cancel_key_data) => {
+                cancel_map.cancel_session(cancel_key_data).await?;
+
                info!(session_type = "cancellation", "successful handshake");
-                break Ok(HandshakeData::Cancel(cancel_key_data));
+                break Ok(None);
            }
        }
    }
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,11 +1,9 @@
 use crate::{
-    compute::PostgresConnection,
    console::messages::MetricsAuxInfo,
+    context::RequestMonitoring,
    metrics::NUM_BYTES_PROXIED_COUNTER,
-    stream::Stream,
    usage_metrics::{Ids, USAGE_METRICS},
 };
-use metrics::IntCounterPairGuard;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 use utils::measured_stream::MeasuredStream;
@@ -13,10 +11,14 @@ use utils::measured_stream::MeasuredStream;
 /// Forward bytes in both directions (client <-> compute).
 #[tracing::instrument(skip_all)]
 pub async fn proxy_pass(
+    ctx: &mut RequestMonitoring,
    client: impl AsyncRead + AsyncWrite + Unpin,
    compute: impl AsyncRead + AsyncWrite + Unpin,
    aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
+    ctx.set_success();
+    ctx.log();
+
    let usage = USAGE_METRICS.register(Ids {
        endpoint_id: aux.endpoint_id.clone(),
        branch_id: aux.branch_id.clone(),
@@ -49,18 +51,3 @@ pub async fn proxy_pass(

    Ok(())
 }
-
-pub struct ProxyPassthrough<S> {
-    pub client: Stream<S>,
-    pub compute: PostgresConnection,
-    pub aux: MetricsAuxInfo,
-
-    pub req: IntCounterPairGuard,
-    pub conn: IntCounterPairGuard,
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
-    pub async fn proxy_pass(self) -> anyhow::Result<()> {
-        proxy_pass(self.client, self.compute.stream, self.aux).await
-    }
-}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -163,11 +163,11 @@ async fn dummy_proxy(
    tls: Option<TlsConfig>,
    auth: impl TestAuth + Send,
 ) -> anyhow::Result<()> {
+    let cancel_map = CancelMap::default();
    let client = WithClientIp::new(client);
-    let mut stream = match handshake(client, tls.as_ref()).await? {
-        HandshakeData::Startup(stream, _) => stream,
-        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
-    };
+    let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map)
+        .await?
+        .context("handshake failed")?;

    auth.authenticate(&mut stream).await?;

@@ -478,9 +478,6 @@ impl TestBackend for TestConnectMechanism {
    {
        unimplemented!("not used in tests")
    }
-    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
-        unimplemented!("not used in tests")
-    }
 }

 fn helper_create_cached_node_info() -> CachedNodeInfo {
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -35,10 +35,12 @@ async fn proxy_mitm(
    tokio::spawn(async move {
        // begin handshake with end_server
        let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
-        let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() {
-            HandshakeData::Startup(stream, params) => (stream, params),
-            HandshakeData::Cancel(_) => panic!("cancellation not supported"),
-        };
+        // process handshake with end_client
+        let (end_client, startup) =
+            handshake(client1, Some(&server_config1), &CancelMap::default())
+                .await
+                .unwrap()
+                .unwrap();

        let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame);
        let (end_client, buf) = end_client.framed.into_inner();
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -10,7 +10,7 @@ mod channel_binding;
 mod messages;
 mod stream;

-use crate::error::{ReportableError, UserFacingError};
+use crate::error::UserFacingError;
 use std::io;
 use thiserror::Error;

@@ -48,18 +48,6 @@ impl UserFacingError for Error {
    }
 }

-impl ReportableError for Error {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        match self {
-            Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
-            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
-            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
-            Error::MissingBinding => crate::error::ErrorKind::Service,
-            Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
-        }
-    }
-}
-
 /// A convenient result type for SASL exchange.
 pub type Result<T> = std::result::Result<T, Error>;

--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -2,7 +2,6 @@
 //!
 //! Handles both SQL over HTTP and SQL over Websockets.

-mod backend;
 mod conn_pool;
 mod json;
 mod sql_over_http;
@@ -19,11 +18,11 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;

+use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::rate_limiter::EndpointRateLimiter;
-use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
@@ -55,13 +54,12 @@ pub async fn task_main(
        info!("websocket server has shut down");
    }

-    let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config);
-    {
-        let conn_pool = Arc::clone(&conn_pool);
-        tokio::spawn(async move {
-            conn_pool.gc_worker(StdRng::from_entropy()).await;
-        });
-    }
+    let conn_pool = conn_pool::GlobalConnPool::new(config);
+
+    let conn_pool2 = Arc::clone(&conn_pool);
+    tokio::spawn(async move {
+        conn_pool2.gc_worker(StdRng::from_entropy()).await;
+    });

    // shutdown the connection pool
    tokio::spawn({
@@ -75,11 +73,6 @@ pub async fn task_main(
        }
    });

-    let backend = Arc::new(PoolingBackend {
-        pool: Arc::clone(&conn_pool),
-        config,
-    });
-
    let tls_config = match config.tls_config.as_ref() {
        Some(config) => config,
        None => {
@@ -109,10 +102,11 @@ pub async fn task_main(

    let make_svc = hyper::service::make_service_fn(
        |stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
-            let (io, _) = stream.get_ref();
+            let (io, tls) = stream.get_ref();
            let client_addr = io.client_addr();
            let remote_addr = io.inner.remote_addr();
-            let backend = backend.clone();
+            let sni_name = tls.server_name().map(|s| s.to_string());
+            let conn_pool = conn_pool.clone();
            let ws_connections = ws_connections.clone();
            let endpoint_rate_limiter = endpoint_rate_limiter.clone();

@@ -124,7 +118,8 @@ pub async fn task_main(
                };
                Ok(MetricService::new(hyper::service::service_fn(
                    move |req: Request<Body>| {
-                        let backend = backend.clone();
+                        let sni_name = sni_name.clone();
+                        let conn_pool = conn_pool.clone();
                        let ws_connections = ws_connections.clone();
                        let endpoint_rate_limiter = endpoint_rate_limiter.clone();

@@ -135,10 +130,12 @@ pub async fn task_main(
                            request_handler(
                                req,
                                config,
-                                backend,
+                                tls_config,
+                                conn_pool,
                                ws_connections,
                                cancel_map,
                                session_id,
+                                sni_name,
                                peer_addr.ip(),
                                endpoint_rate_limiter,
                            )
@@ -203,10 +200,12 @@ where
 async fn request_handler(
    mut request: Request<Body>,
    config: &'static ProxyConfig,
-    backend: Arc<PoolingBackend>,
+    tls: &'static TlsConfig,
+    conn_pool: Arc<conn_pool::GlobalConnPool>,
    ws_connections: TaskTracker,
    cancel_map: Arc<CancelMap>,
    session_id: uuid::Uuid,
+    sni_hostname: Option<String>,
    peer_addr: IpAddr,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Body>, ApiError> {
@@ -226,11 +225,11 @@ async fn request_handler(

        ws_connections.spawn(
            async move {
-                let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
+                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);

                if let Err(e) = websocket::serve_websocket(
                    config,
-                    ctx,
+                    &mut ctx,
                    websocket,
                    cancel_map,
                    host,
@@ -247,9 +246,17 @@ async fn request_handler(
        // Return the response so the spawned future can continue.
        Ok(response)
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
+        let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);

-        sql_over_http::handle(config, ctx, request, backend).await
+        sql_over_http::handle(
+            tls,
+            &config.http_config,
+            &mut ctx,
+            request,
+            sni_hostname,
+            conn_pool,
+        )
+        .await
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
        Response::builder()
            .header("Allow", "OPTIONS, POST")
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,176 +0,0 @@
-use std::{sync::Arc, time::Duration};
-
-use async_trait::async_trait;
-use tracing::info;
-
-use crate::{
-    auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError},
-    compute,
-    config::ProxyConfig,
-    console::{
-        errors::{GetAuthInfoError, WakeComputeError},
-        CachedNodeInfo,
-    },
-    context::RequestMonitoring,
-    proxy::connect_compute::ConnectMechanism,
-};
-
-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME};
-
-pub struct PoolingBackend {
-    pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
-    pub config: &'static ProxyConfig,
-}
-
-impl PoolingBackend {
-    pub async fn authenticate(
-        &self,
-        ctx: &mut RequestMonitoring,
-        conn_info: &ConnInfo,
-    ) -> Result<ComputeCredentialKeys, AuthError> {
-        let user_info = conn_info.user_info.clone();
-        let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
-        let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-            return Err(AuthError::ip_address_not_allowed());
-        }
-        let cached_secret = match maybe_secret {
-            Some(secret) => secret,
-            None => backend.get_role_secret(ctx).await?,
-        };
-
-        let secret = match cached_secret.value.clone() {
-            Some(secret) => secret,
-            None => {
-                // If we don't have an authentication secret, for the http flow we can just return an error.
-                info!("authentication info not found");
-                return Err(AuthError::auth_failed(&*user_info.user));
-            }
-        };
-        let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret)?;
-        match auth_outcome {
-            crate::sasl::Outcome::Success(key) => Ok(key),
-            crate::sasl::Outcome::Failure(reason) => {
-                info!("auth backend failed with an error: {reason}");
-                Err(AuthError::auth_failed(&*conn_info.user_info.user))
-            }
-        }
-    }
-
-    // Wake up the destination if needed. Code here is a bit involved because
-    // we reuse the code from the usual proxy and we need to prepare few structures
-    // that this code expects.
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
-    pub async fn connect_to_compute(
-        &self,
-        ctx: &mut RequestMonitoring,
-        conn_info: ConnInfo,
-        keys: ComputeCredentialKeys,
-        force_new: bool,
-    ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
-        let maybe_client = if !force_new {
-            info!("pool: looking for an existing connection");
-            self.pool.get(ctx, &conn_info).await?
-        } else {
-            info!("pool: pool is disabled");
-            None
-        };
-
-        if let Some(client) = maybe_client {
-            return Ok(client);
-        }
-        let conn_id = uuid::Uuid::new_v4();
-        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        ctx.set_application(Some(APP_NAME));
-        let backend = self
-            .config
-            .auth_backend
-            .as_ref()
-            .map(|_| conn_info.user_info.clone());
-
-        let mut node_info = backend
-            .wake_compute(ctx)
-            .await?
-            .ok_or(HttpConnError::NoComputeInfo)?;
-
-        match keys {
-            #[cfg(any(test, feature = "testing"))]
-            ComputeCredentialKeys::Password(password) => node_info.config.password(password),
-            ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys),
-        };
-
-        ctx.set_project(node_info.aux.clone());
-
-        crate::proxy::connect_compute::connect_to_compute(
-            ctx,
-            &TokioMechanism {
-                conn_id,
-                conn_info,
-                pool: self.pool.clone(),
-            },
-            node_info,
-            &backend,
-        )
-        .await
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum HttpConnError {
-    #[error("pooled connection closed at inconsistent state")]
-    ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
-    #[error("could not connection to compute")]
-    ConnectionError(#[from] tokio_postgres::Error),
-
-    #[error("could not get auth info")]
-    GetAuthInfo(#[from] GetAuthInfoError),
-    #[error("user not authenticated")]
-    AuthError(#[from] AuthError),
-    #[error("wake_compute returned error")]
-    WakeCompute(#[from] WakeComputeError),
-    #[error("wake_compute returned nothing")]
-    NoComputeInfo,
-}
-
-struct TokioMechanism {
-    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
-    conn_info: ConnInfo,
-    conn_id: uuid::Uuid,
-}
-
-#[async_trait]
-impl ConnectMechanism for TokioMechanism {
-    type Connection = Client<tokio_postgres::Client>;
-    type ConnectError = tokio_postgres::Error;
-    type Error = HttpConnError;
-
-    async fn connect_once(
-        &self,
-        ctx: &mut RequestMonitoring,
-        node_info: &CachedNodeInfo,
-        timeout: Duration,
-    ) -> Result<Self::Connection, Self::ConnectError> {
-        let mut config = (*node_info.config).clone();
-        let config = config
-            .user(&self.conn_info.user_info.user)
-            .password(&*self.conn_info.password)
-            .dbname(&self.conn_info.dbname)
-            .connect_timeout(timeout);
-
-        let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
-
-        tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
-        Ok(poll_client(
-            self.pool.clone(),
-            ctx,
-            self.conn_info.clone(),
-            client,
-            connection,
-            self.conn_id,
-            node_info.aux.clone(),
-        ))
-    }
-
-    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
-}
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -9,23 +9,23 @@ use tokio_postgres::Row;
 // as parameters.
 //
 pub fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
-    json.iter().map(json_value_to_pg_text).collect()
-}
+    json.iter()
+        .map(|value| {
+            match value {
+                // special care for nulls
+                Value::Null => None,

-fn json_value_to_pg_text(value: &Value) -> Option<String> {
-    match value {
-        // special care for nulls
-        Value::Null => None,
+                // convert to text with escaping
+                v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),

-        // convert to text with escaping
-        v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
+                // avoid escaping here, as we pass this as a parameter
+                Value::String(s) => Some(s.to_string()),

-        // avoid escaping here, as we pass this as a parameter
-        Value::String(s) => Some(s.to_string()),
-
-        // special care for arrays
-        Value::Array(_) => json_array_to_pg_array(value),
-    }
+                // special care for arrays
+                Value::Array(_) => json_array_to_pg_array(value),
+            }
+        })
+        .collect()
 }

 //
@@ -60,20 +60,6 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum JsonConversionError {
-    #[error("internal error compute returned invalid data: {0}")]
-    AsTextError(tokio_postgres::Error),
-    #[error("parse int error: {0}")]
-    ParseIntError(#[from] std::num::ParseIntError),
-    #[error("parse float error: {0}")]
-    ParseFloatError(#[from] std::num::ParseFloatError),
-    #[error("parse json error: {0}")]
-    ParseJsonError(#[from] serde_json::Error),
-    #[error("unbalanced array")]
-    UnbalancedArray,
-}
-
 //
 // Convert postgres row with text-encoded values to JSON object
 //
@@ -82,7 +68,7 @@ pub fn pg_text_row_to_json(
    columns: &[Type],
    raw_output: bool,
    array_mode: bool,
-) -> Result<Value, JsonConversionError> {
+) -> Result<Value, anyhow::Error> {
    let iter = row
        .columns()
        .iter()
@@ -90,7 +76,7 @@ pub fn pg_text_row_to_json(
        .enumerate()
        .map(|(i, (column, typ))| {
            let name = column.name();
-            let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?;
+            let pg_value = row.as_text(i)?;
            let json_value = if raw_output {
                match pg_value {
                    Some(v) => Value::String(v.to_string()),
@@ -106,10 +92,10 @@ pub fn pg_text_row_to_json(
        // drop keys and aggregate into array
        let arr = iter
            .map(|r| r.map(|(_key, val)| val))
-            .collect::<Result<Vec<Value>, JsonConversionError>>()?;
+            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
        Ok(Value::Array(arr))
    } else {
-        let obj = iter.collect::<Result<Map<String, Value>, JsonConversionError>>()?;
+        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
        Ok(Value::Object(obj))
    }
 }
@@ -117,7 +103,7 @@ pub fn pg_text_row_to_json(
 //
 // Convert postgres text-encoded value to JSON value
 //
-fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, JsonConversionError> {
+fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
        if let Kind::Array(elem_type) = pg_type.kind() {
            return pg_array_parse(val, elem_type);
@@ -156,7 +142,7 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, Json
 // values. Unlike postgres we don't check that all nested arrays have the same
 // dimensions, we just return them as is.
 //
-fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, JsonConversionError> {
+fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
 }

@@ -164,7 +150,7 @@ fn _pg_array_parse(
    pg_array: &str,
    elem_type: &Type,
    nested: bool,
-) -> Result<(Value, usize), JsonConversionError> {
+) -> Result<(Value, usize), anyhow::Error> {
    let mut pg_array_chr = pg_array.char_indices();
    let mut level = 0;
    let mut quote = false;
@@ -184,7 +170,7 @@ fn _pg_array_parse(
        entry: &mut String,
        entries: &mut Vec<Value>,
        elem_type: &Type,
-    ) -> Result<(), JsonConversionError> {
+    ) -> Result<(), anyhow::Error> {
        if !entry.is_empty() {
            // While in usual postgres response we get nulls as None and everything else
            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
@@ -248,7 +234,7 @@ fn _pg_array_parse(
    }

    if level != 0 {
-        return Err(JsonConversionError::UnbalancedArray);
+        return Err(anyhow::anyhow!("unbalanced array"));
    }

    Ok((Value::Array(entries), 0))
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;

 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -12,7 +13,6 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
-use tokio::join;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::GenericClient;
@@ -20,7 +20,6 @@ use tokio_postgres::IsolationLevel;
 use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
 use tracing::error;
-use tracing::info;
 use tracing::instrument;
 use url::Url;
 use utils::http::error::ApiError;
@@ -28,29 +27,22 @@ use utils::http::json::json_response;

 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
-use crate::auth::ComputeUserInfoParseError;
-use crate::config::ProxyConfig;
+use crate::config::HttpConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
-use crate::error::ReportableError;
-use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
 use crate::RoleName;

-use super::backend::PoolingBackend;
 use super::conn_pool::ConnInfo;
-use super::json::json_to_pg_text;
-use super::json::pg_text_row_to_json;
+use super::conn_pool::GlobalConnPool;
+use super::json::{json_to_pg_text, pg_text_row_to_json};
+use super::SERVERLESS_DRIVER_SNI;

 #[derive(serde::Deserialize)]
-#[serde(rename_all = "camelCase")]
 struct QueryData {
    query: String,
-    #[serde(deserialize_with = "bytes_to_pg_text")]
-    params: Vec<Option<String>>,
-    #[serde(default)]
-    array_mode: Option<bool>,
+    params: Vec<serde_json::Value>,
 }

 #[derive(serde::Deserialize)]
@@ -77,82 +69,67 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

-fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result<Vec<Option<String>>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    // TODO: consider avoiding the allocation here.
-    let json: Vec<Value> = serde::de::Deserialize::deserialize(deserializer)?;
-    Ok(json_to_pg_text(json))
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum ConnInfoError {
-    #[error("invalid header: {0}")]
-    InvalidHeader(&'static str),
-    #[error("invalid connection string: {0}")]
-    UrlParseError(#[from] url::ParseError),
-    #[error("incorrect scheme")]
-    IncorrectScheme,
-    #[error("missing database name")]
-    MissingDbName,
-    #[error("invalid database name")]
-    InvalidDbName,
-    #[error("missing username")]
-    MissingUsername,
-    #[error("invalid username: {0}")]
-    InvalidUsername(#[from] std::string::FromUtf8Error),
-    #[error("missing password")]
-    MissingPassword,
-    #[error("missing hostname")]
-    MissingHostname,
-    #[error("invalid hostname: {0}")]
-    InvalidEndpoint(#[from] ComputeUserInfoParseError),
-    #[error("malformed endpoint")]
-    MalformedEndpoint,
-}
-
 fn get_conn_info(
    ctx: &mut RequestMonitoring,
    headers: &HeaderMap,
+    sni_hostname: Option<String>,
    tls: &TlsConfig,
-) -> Result<ConnInfo, ConnInfoError> {
+) -> Result<ConnInfo, anyhow::Error> {
    let connection_string = headers
        .get("Neon-Connection-String")
-        .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))?
-        .to_str()
-        .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?;
+        .ok_or(anyhow::anyhow!("missing connection string"))?
+        .to_str()?;

    let connection_url = Url::parse(connection_string)?;

    let protocol = connection_url.scheme();
    if protocol != "postgres" && protocol != "postgresql" {
-        return Err(ConnInfoError::IncorrectScheme);
+        return Err(anyhow::anyhow!(
+            "connection string must start with postgres: or postgresql:"
+        ));
    }

    let mut url_path = connection_url
        .path_segments()
-        .ok_or(ConnInfoError::MissingDbName)?;
+        .ok_or(anyhow::anyhow!("missing database name"))?;

-    let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?;
+    let dbname = url_path
+        .next()
+        .ok_or(anyhow::anyhow!("invalid database name"))?;

-    let username = RoleName::from(urlencoding::decode(connection_url.username())?);
+    let username = RoleName::from(connection_url.username());
    if username.is_empty() {
-        return Err(ConnInfoError::MissingUsername);
+        return Err(anyhow::anyhow!("missing username"));
    }
    ctx.set_user(username.clone());

    let password = connection_url
        .password()
-        .ok_or(ConnInfoError::MissingPassword)?;
-    let password = urlencoding::decode_binary(password.as_bytes());
+        .ok_or(anyhow::anyhow!("no password"))?;
+
+    // TLS certificate selector now based on SNI hostname, so if we are running here
+    // we are sure that SNI hostname is set to one of the configured domain names.
+    let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?;

    let hostname = connection_url
        .host_str()
-        .ok_or(ConnInfoError::MissingHostname)?;
+        .ok_or(anyhow::anyhow!("no host"))?;

-    let endpoint =
-        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
+    let host_header = headers
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next());
+
+    // sni_hostname has to be either the same as hostname or the one used in serverless driver.
+    if !check_matches(&sni_hostname, hostname)? {
+        return Err(anyhow::anyhow!("mismatched SNI hostname and hostname"));
+    } else if let Some(h) = host_header {
+        if h != sni_hostname {
+            return Err(anyhow::anyhow!("mismatched host header and hostname"));
+        }
+    }
+
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
    ctx.set_endpoint_id(endpoint.clone());

    let pairs = connection_url.query_pairs();
@@ -175,34 +152,41 @@ fn get_conn_info(
    Ok(ConnInfo {
        user_info,
        dbname: dbname.into(),
-        password: match password {
-            std::borrow::Cow::Borrowed(b) => b.into(),
-            std::borrow::Cow::Owned(b) => b.into(),
-        },
+        password: password.into(),
    })
 }

+fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Error> {
+    if sni_hostname == hostname {
+        return Ok(true);
+    }
+    let (sni_hostname_first, sni_hostname_rest) = sni_hostname
+        .split_once('.')
+        .ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?;
+    let (_, hostname_rest) = hostname
+        .split_once('.')
+        .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
+}
+
 // TODO: return different http error codes
 pub async fn handle(
-    config: &'static ProxyConfig,
-    mut ctx: RequestMonitoring,
+    tls: &'static TlsConfig,
+    config: &'static HttpConfig,
+    ctx: &mut RequestMonitoring,
    request: Request<Body>,
-    backend: Arc<PoolingBackend>,
+    sni_hostname: Option<String>,
+    conn_pool: Arc<GlobalConnPool>,
 ) -> Result<Response<Body>, ApiError> {
    let result = tokio::time::timeout(
-        config.http_config.request_timeout,
-        handle_inner(config, &mut ctx, request, backend),
+        config.request_timeout,
+        handle_inner(tls, config, ctx, request, sni_hostname, conn_pool),
    )
    .await;
    let mut response = match result {
        Ok(r) => match r {
-            Ok(r) => {
-                ctx.set_success();
-                r
-            }
+            Ok(r) => r,
            Err(e) => {
-                // TODO: ctx.set_error_kind(e.get_error_type());
-
                let mut message = format!("{:?}", e);
                let db_error = e
                    .downcast_ref::<tokio_postgres::Error>()
@@ -278,12 +262,10 @@ pub async fn handle(
                )?
            }
        },
-        Err(e) => {
-            ctx.set_error_kind(e.get_error_kind());
-
+        Err(_) => {
            let message = format!(
                "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.http_config.request_timeout.as_secs()
+                config.request_timeout.as_secs()
            );
            error!(message);
            json_response(
@@ -292,7 +274,6 @@ pub async fn handle(
            )?
        }
    };
-
    response.headers_mut().insert(
        "Access-Control-Allow-Origin",
        hyper::http::HeaderValue::from_static("*"),
@@ -302,40 +283,32 @@ pub async fn handle(

 #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
 async fn handle_inner(
-    config: &'static ProxyConfig,
+    tls: &'static TlsConfig,
+    config: &'static HttpConfig,
    ctx: &mut RequestMonitoring,
    request: Request<Body>,
-    backend: Arc<PoolingBackend>,
+    sni_hostname: Option<String>,
+    conn_pool: Arc<GlobalConnPool>,
 ) -> anyhow::Result<Response<Body>> {
    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
-        .with_label_values(&[ctx.protocol])
+        .with_label_values(&["http"])
        .guard();
-    info!(
-        protocol = ctx.protocol,
-        "handling interactive connection from client"
-    );

    //
    // Determine the destination and connection params
    //
    let headers = request.headers();
-    // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
-    info!(
-        user = conn_info.user_info.user.as_str(),
-        project = conn_info.user_info.endpoint.as_str(),
-        "credentials"
-    );
+    let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?;

    // Determine the output options. Default behaviour is 'false'. Anything that is not
    // strictly 'true' assumed to be false.
    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-    let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);

    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
-    let allow_pool = !config.http_config.pool_options.opt_in
-        || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool =
+        !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

    // isolation level, read only and deferrable

@@ -360,8 +333,6 @@ async fn handle_inner(
        None => MAX_REQUEST_SIZE + 1,
    };
    drop(paused);
-    info!(request_content_length, "request size in bytes");
-    HTTP_CONTENT_LENGTH.observe(request_content_length as f64);

    // we don't have a streaming request support yet so this is to prevent OOM
    // from a malicious user sending an extremely large request body
@@ -371,28 +342,13 @@ async fn handle_inner(
        ));
    }

-    let fetch_and_process_request = async {
-        let body = hyper::body::to_bytes(request.into_body())
-            .await
-            .map_err(anyhow::Error::from)?;
-        let payload: Payload = serde_json::from_slice(&body)?;
-        Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
-    };
+    //
+    // Read the query and query params from the request body
+    //
+    let body = hyper::body::to_bytes(request.into_body()).await?;
+    let payload: Payload = serde_json::from_slice(&body)?;

-    let authenticate_and_connect = async {
-        let keys = backend.authenticate(ctx, &conn_info).await?;
-        backend
-            .connect_to_compute(ctx, conn_info, keys, !allow_pool)
-            .await
-    };
-
-    // Run both operations in parallel
-    let (payload_result, auth_and_connect_result) =
-        join!(fetch_and_process_request, authenticate_and_connect,);
-
-    // Handle the results
-    let payload = payload_result?; // Handle errors appropriately
-    let mut client = auth_and_connect_result?; // Handle errors appropriately
+    let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;

    let mut response = Response::builder()
        .status(StatusCode::OK)
@@ -402,88 +358,86 @@ async fn handle_inner(
    // Now execute the query and return the result
    //
    let mut size = 0;
-    let result = match payload {
-        Payload::Single(stmt) => {
-            let (status, results) =
-                query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode)
-                    .await
-                    .map_err(|e| {
-                        client.discard();
-                        e
-                    })?;
-            client.check_idle(status);
-            results
-        }
-        Payload::Batch(statements) => {
-            let (inner, mut discard) = client.inner();
-            let mut builder = inner.build_transaction();
-            if let Some(isolation_level) = txn_isolation_level {
-                builder = builder.isolation_level(isolation_level);
+    let result =
+        match payload {
+            Payload::Single(stmt) => {
+                let (status, results) =
+                    query_to_json(&*client, stmt, &mut 0, raw_output, array_mode)
+                        .await
+                        .map_err(|e| {
+                            client.discard();
+                            e
+                        })?;
+                client.check_idle(status);
+                results
            }
-            if txn_read_only {
-                builder = builder.read_only(true);
-            }
-            if txn_deferrable {
-                builder = builder.deferrable(true);
-            }
-
-            let transaction = builder.start().await.map_err(|e| {
-                // if we cannot start a transaction, we should return immediately
-                // and not return to the pool. connection is clearly broken
-                discard.discard();
-                e
-            })?;
-
-            let results = match query_batch(
-                &transaction,
-                statements,
-                &mut size,
-                raw_output,
-                default_array_mode,
-            )
-            .await
-            {
-                Ok(results) => {
-                    let status = transaction.commit().await.map_err(|e| {
-                        // if we cannot commit - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                        e
-                    })?;
-                    discard.check_idle(status);
-                    results
+            Payload::Batch(statements) => {
+                let (inner, mut discard) = client.inner();
+                let mut builder = inner.build_transaction();
+                if let Some(isolation_level) = txn_isolation_level {
+                    builder = builder.isolation_level(isolation_level);
                }
-                Err(err) => {
-                    let status = transaction.rollback().await.map_err(|e| {
-                        // if we cannot rollback - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                        e
-                    })?;
-                    discard.check_idle(status);
-                    return Err(err);
+                if txn_read_only {
+                    builder = builder.read_only(true);
+                }
+                if txn_deferrable {
+                    builder = builder.deferrable(true);
                }
-            };

-            if txn_read_only {
-                response = response.header(
-                    TXN_READ_ONLY.clone(),
-                    HeaderValue::try_from(txn_read_only.to_string())?,
-                );
-            }
-            if txn_deferrable {
-                response = response.header(
-                    TXN_DEFERRABLE.clone(),
-                    HeaderValue::try_from(txn_deferrable.to_string())?,
-                );
-            }
-            if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
-            }
-            json!({ "results": results })
-        }
-    };
+                let transaction = builder.start().await.map_err(|e| {
+                    // if we cannot start a transaction, we should return immediately
+                    // and not return to the pool. connection is clearly broken
+                    discard.discard();
+                    e
+                })?;

+                let results =
+                    match query_batch(&transaction, statements, &mut size, raw_output, array_mode)
+                        .await
+                    {
+                        Ok(results) => {
+                            let status = transaction.commit().await.map_err(|e| {
+                                // if we cannot commit - for now don't return connection to pool
+                                // TODO: get a query status from the error
+                                discard.discard();
+                                e
+                            })?;
+                            discard.check_idle(status);
+                            results
+                        }
+                        Err(err) => {
+                            let status = transaction.rollback().await.map_err(|e| {
+                                // if we cannot rollback - for now don't return connection to pool
+                                // TODO: get a query status from the error
+                                discard.discard();
+                                e
+                            })?;
+                            discard.check_idle(status);
+                            return Err(err);
+                        }
+                    };
+
+                if txn_read_only {
+                    response = response.header(
+                        TXN_READ_ONLY.clone(),
+                        HeaderValue::try_from(txn_read_only.to_string())?,
+                    );
+                }
+                if txn_deferrable {
+                    response = response.header(
+                        TXN_DEFERRABLE.clone(),
+                        HeaderValue::try_from(txn_deferrable.to_string())?,
+                    );
+                }
+                if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                    response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+                }
+                json!({ "results": results })
+            }
+        };
+
+    ctx.set_success();
+    ctx.log();
    let metrics = client.metrics();

    // how could this possibly fail
@@ -526,9 +480,9 @@ async fn query_to_json<T: GenericClient>(
    data: QueryData,
    current_size: &mut usize,
    raw_output: bool,
-    default_array_mode: bool,
+    array_mode: bool,
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
-    let query_params = data.params;
+    let query_params = json_to_pg_text(data.params);
    let row_stream = client.query_raw_txt(&data.query, query_params).await?;

    // Manually drain the stream into a vector to leave row_stream hanging
@@ -580,8 +534,6 @@ async fn query_to_json<T: GenericClient>(
        columns.push(client.get_type(c.type_oid()).await?);
    }

-    let array_mode = data.array_mode.unwrap_or(default_array_mode);
-
    // convert rows to JSON
    let rows = rows
        .iter()
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -2,7 +2,7 @@ use crate::{
    cancellation::CancelMap,
    config::ProxyConfig,
    context::RequestMonitoring,
-    error::{io_error, ReportableError},
+    error::io_error,
    proxy::{handle_client, ClientMode},
    rate_limiter::EndpointRateLimiter,
 };
@@ -131,41 +131,23 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {

 pub async fn serve_websocket(
    config: &'static ProxyConfig,
-    mut ctx: RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    websocket: HyperWebsocket,
    cancel_map: Arc<CancelMap>,
    hostname: Option<String>,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
    let websocket = websocket.await?;
-    let res = handle_client(
+    handle_client(
        config,
-        &mut ctx,
+        ctx,
        cancel_map,
        WebSocketRw::new(websocket),
        ClientMode::Websockets { hostname },
        endpoint_rate_limiter,
    )
-    .await;
-
-    match res {
-        Err(e) => {
-            // todo: log and push to ctx the error kind
-            ctx.set_error_kind(e.get_error_kind());
-            ctx.log();
-            Err(e.into())
-        }
-        Ok(None) => {
-            ctx.set_success();
-            ctx.log();
-            Ok(())
-        }
-        Ok(Some(p)) => {
-            ctx.set_success();
-            ctx.log();
-            p.proxy_pass().await
-        }
-    }
+    .await?;
+    Ok(())
 }

 #[cfg(test)]
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -1,5 +1,6 @@
 use crate::config::TlsServerEndPoint;
-use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::error::UserFacingError;
+use anyhow::bail;
 use bytes::BytesMut;

 use pq_proto::framed::{ConnectionError, Framed};
@@ -72,30 +73,6 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
    }
 }

-#[derive(Debug)]
-pub struct ReportedError {
-    source: anyhow::Error,
-    error_kind: ErrorKind,
-}
-
-impl std::fmt::Display for ReportedError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.source.fmt(f)
-    }
-}
-
-impl std::error::Error for ReportedError {
-    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
-        self.source.source()
-    }
-}
-
-impl ReportableError for ReportedError {
-    fn get_error_kind(&self) -> ErrorKind {
-        self.error_kind
-    }
-}
-
 impl<S: AsyncWrite + Unpin> PqStream<S> {
    /// Write the message into an internal buffer, but don't flush the underlying stream.
    pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
@@ -121,52 +98,24 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
    /// Write the error message using [`Self::write_message`], then re-throw it.
    /// Allowing string literals is safe under the assumption they might not contain any runtime info.
    /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
-    pub async fn throw_error_str<T>(
-        &mut self,
-        msg: &'static str,
-        error_kind: ErrorKind,
-    ) -> Result<T, ReportedError> {
-        tracing::info!(
-            kind = error_kind.to_metric_label(),
-            msg,
-            "forwarding error to user"
-        );
-
-        // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(msg, None))
-            .await;
-
-        Err(ReportedError {
-            source: anyhow::anyhow!(msg),
-            error_kind,
-        })
+    pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
+        tracing::info!("forwarding error to user: {error}");
+        self.write_message(&BeMessage::ErrorResponse(error, None))
+            .await?;
+        bail!(error)
    }

    /// Write the error message using [`Self::write_message`], then re-throw it.
    /// Trait [`UserFacingError`] acts as an allowlist for error types.
-    pub async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
+    pub async fn throw_error<T, E>(&mut self, error: E) -> anyhow::Result<T>
    where
        E: UserFacingError + Into<anyhow::Error>,
    {
-        let error_kind = error.get_error_kind();
        let msg = error.to_string_client();
-        tracing::info!(
-            kind=error_kind.to_metric_label(),
-            error=%error,
-            msg,
-            "forwarding error to user"
-        );
-
-        // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(&msg, None))
-            .await;
-
-        Err(ReportedError {
-            source: anyhow::anyhow!(error),
-            error_kind,
-        })
+        tracing::info!("forwarding error to user: {msg}");
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await?;
+        bail!(error)
    }
 }

--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -237,7 +237,6 @@ mod tests {
    use std::{
        net::TcpListener,
        sync::{Arc, Mutex},
-        time::Duration,
    };

    use anyhow::Error;
@@ -280,7 +279,7 @@ mod tests {
        tokio::spawn(server);

        let metrics = Metrics::default();
-        let client = http::new_client(RateLimiterConfig::default(), Duration::from_secs(15));
+        let client = http::new_client(RateLimiterConfig::default());
        let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
        let now = Utc::now();

--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.76.0"
+channel = "1.75.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -10,7 +10,6 @@ use utils::id::NodeId;

 use std::cmp::min;
 use std::collections::{HashMap, HashSet};
-use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
@@ -547,10 +546,6 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
    let remote_path = RemotePath::new(&ttid_path)?;

-    // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
-    // const Option unwrap is not stable, otherwise it would be const.
-    let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap();
-
    // A backoff::retry is used here for two reasons:
    // - To provide a backoff rather than busy-polling the API on errors
    // - To absorb transient 429/503 conditions without hitting our error
@@ -562,26 +557,8 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    let token = CancellationToken::new(); // not really used
    backoff::retry(
        || async {
-            // Do list-delete in batch_size batches to make progress even if there a lot of files.
-            // Alternatively we could make list_files return iterator, but it is more complicated and
-            // I'm not sure deleting while iterating is expected in s3.
-            loop {
-                let files = storage
-                    .list_files(Some(&remote_path), Some(batch_size))
-                    .await?;
-                if files.is_empty() {
-                    return Ok(()); // done
-                }
-                // (at least) s3 results are sorted, so can log min/max:
-                // "List results are always returned in UTF-8 binary order."
-                info!(
-                    "deleting batch of {} WAL segments [{}-{}]",
-                    files.len(),
-                    files.first().unwrap().object_name().unwrap_or(""),
-                    files.last().unwrap().object_name().unwrap_or("")
-                );
-                storage.delete_objects(&files).await?;
-            }
+            let files = storage.list_files(Some(&remote_path)).await?;
+            storage.delete_objects(&files).await
        },
        |_| false,
        3,
@@ -617,7 +594,7 @@ pub async fn copy_s3_segments(

    let remote_path = RemotePath::new(&relative_dst_path)?;

-    let files = storage.list_files(Some(&remote_path), None).await?;
+    let files = storage.list_files(Some(&remote_path)).await?;
    let uploaded_segments = &files
        .iter()
        .filter_map(|file| file.object_name().map(ToOwned::to_owned))
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -23,7 +23,7 @@ from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
-from urllib.parse import quote, urlparse
+from urllib.parse import urlparse

 import asyncpg
 import backoff
@@ -1816,7 +1816,6 @@ class NeonCli(AbstractNeonCli):
        endpoint_id: str,
        destroy=False,
        check_return_code=True,
-        mode: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1824,8 +1823,6 @@ class NeonCli(AbstractNeonCli):
        ]
        if destroy:
            args.append("--destroy")
-        if mode is not None:
-            args.append(f"--mode={mode}")
        if endpoint_id is not None:
            args.append(endpoint_id)

@@ -1952,15 +1949,6 @@ class NeonAttachmentService:

        return headers

-    def ready(self) -> bool:
-        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
-        if resp.status_code == 503:
-            return False
-        elif resp.status_code == 200:
-            return True
-        else:
-            raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
-
    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
@@ -2822,8 +2810,8 @@ class NeonProxy(PgProtocol):

    def http_query(self, query, args, **kwargs):
        # TODO maybe use default values if not provided
-        user = quote(kwargs["user"])
-        password = quote(kwargs["password"])
+        user = kwargs["user"]
+        password = kwargs["password"]
        expected_code = kwargs.get("expected_code")

        connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
@@ -3165,7 +3153,7 @@ class Endpoint(PgProtocol):
        with open(remote_extensions_spec_path, "w") as file:
            json.dump(spec, file, indent=4)

-    def stop(self, mode: str = "fast") -> "Endpoint":
+    def stop(self) -> "Endpoint":
        """
        Stop the Postgres instance if it's running.
        Returns self.
@@ -3174,13 +3162,13 @@ class Endpoint(PgProtocol):
        if self.running:
            assert self.endpoint_id is not None
            self.env.neon_cli.endpoint_stop(
-                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
+                self.endpoint_id, check_return_code=self.check_stop_result
            )
            self.running = False

        return self

-    def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint":
+    def stop_and_destroy(self) -> "Endpoint":
        """
        Stop the Postgres instance, then destroy the endpoint.
        Returns self.
@@ -3188,7 +3176,7 @@ class Endpoint(PgProtocol):

        assert self.endpoint_id is not None
        self.env.neon_cli.endpoint_stop(
-            self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
+            self.endpoint_id, True, check_return_code=self.check_stop_result
        )
        self.endpoint_id = None
        self.running = False
@@ -3967,27 +3955,24 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:

 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
-
    # Get the timeline ID. We need it for the 'basebackup' command
    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

+    # many tests already checkpoint, but do it just in case
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CHECKPOINT")
+
+    # wait for pageserver to catch up
+    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

-    # Read the shutdown checkpoint's LSN
-    pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata")
-    cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}"
-    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
-    checkpoint_lsn = re.findall(
-        "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
-    )[0]
-    log.debug(f"last checkpoint at {checkpoint_lsn}")
-
    # Take a basebackup from pageserver
    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
    restored_dir_path.mkdir(exist_ok=True)

+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
    psql_path = os.path.join(pg_bin.pg_bin_path, "psql")

    pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3995,7 +3980,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
         | tar -x -C {restored_dir_path}
    """

@@ -4069,7 +4054,7 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -


 def tenant_get_shards(
-    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None
+    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int]
 ) -> list[tuple[TenantShardId, NeonPageserver]]:
    """
    Helper for when you want to talk to one or more pageservers, and the
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):

    endpoint.wait_for_migrations()

-    num_migrations = 4
+    num_migrations = 3

    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
@@ -24,7 +24,7 @@ def test_migrations(neon_simple_env: NeonEnv):

    with open(log_path, "r") as log_file:
        logs = log_file.read()
-        assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs
+        assert "INFO handle_migrations: Ran 3 migrations" in logs

    endpoint.stop()
    endpoint.start()
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -76,21 +76,3 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
            assert [r[0] for r in res] == [10, 20, 30, 40]

        wait_until(10, 0.5, check_that_changes_propagated)
-
-        # Test that pg_monitor is working for neon_superuser role
-        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
-        assert cur.fetchall()[0][0] != "<insufficient privilege>"
-        # Test that pg_monitor is not working for non neon_superuser role without grant
-        cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'")
-        cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION")
-        cur.execute("SET ROLE not_a_superuser")
-        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
-        assert cur.fetchall()[0][0] == "<insufficient privilege>"
-        cur.execute("RESET ROLE")
-        # Test that pg_monitor is working for non neon_superuser role with grant
-        cur.execute("GRANT pg_monitor TO not_a_superuser")
-        cur.execute("SET ROLE not_a_superuser")
-        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
-        assert cur.fetchall()[0][0] != "<insufficient privilege>"
-        cur.execute("RESET ROLE")
-        cur.execute("DROP ROLE not_a_superuser")
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -197,14 +197,6 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
    ##### Stop the first pageserver instance, erase all its data
    env.endpoints.stop_all()

-    # Stop safekeepers and take another checkpoint. The endpoints might
-    # have written a few more bytes during shutdown.
-    for sk in env.safekeepers:
-        sk.stop()
-
-    client.timeline_checkpoint(tenant_id, timeline_id)
-    current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
-
    # wait until pageserver has successfully uploaded all the data to remote storage
    wait_for_upload(client, tenant_id, timeline_id, current_lsn)

--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -390,47 +390,14 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    assert result[0]["rows"] == [{"answer": 42}]


-def test_sql_over_http_batch_output_options(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create role http with login password 'http' superuser")
-
-    connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
-    response = requests.post(
-        f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
-        data=json.dumps(
-            {
-                "queries": [
-                    {"query": "select $1 as answer", "params": [42], "arrayMode": True},
-                    {"query": "select $1 as answer", "params": [42], "arrayMode": False},
-                ]
-            }
-        ),
-        headers={
-            "Content-Type": "application/sql",
-            "Neon-Connection-String": connstr,
-            "Neon-Batch-Isolation-Level": "Serializable",
-            "Neon-Batch-Read-Only": "false",
-            "Neon-Batch-Deferrable": "false",
-        },
-        verify=str(static_proxy.test_output_dir / "proxy.crt"),
-    )
-    assert response.status_code == 200
-    results = response.json()["results"]
-
-    assert results[0]["rowAsArray"]
-    assert results[0]["rows"] == [["42"]]
-
-    assert not results[1]["rowAsArray"]
-    assert results[1]["rows"] == [{"answer": "42"}]
-
-
 def test_sql_over_http_pool(static_proxy: NeonProxy):
    static_proxy.safe_psql("create user http_auth with password 'http' superuser")

-    def get_pid(status: int, pw: str, user="http_auth") -> Any:
+    def get_pid(status: int, pw: str) -> Any:
        return static_proxy.http_query(
            GET_CONNECTION_PID_QUERY,
            [],
-            user=user,
+            user="http_auth",
            password=pw,
            expected_code=status,
        )
@@ -451,29 +418,23 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):

    static_proxy.safe_psql("alter user http_auth with password 'http2'")

-    # after password change, shouldn't open a new connection because it checks password in proxy.
-    rows = get_pid(200, "http2")["rows"]
-    assert rows == [{"pid": pid1}]
+    # after password change, should open a new connection to verify it
+    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid1 != pid2

    time.sleep(0.02)

-    # incorrect user shouldn't reveal that the user doesn't exists
-    res = get_pid(400, "http", user="http_auth2")
+    # query should be on an existing connection
+    pid = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid in [pid1, pid2]
+
+    time.sleep(0.02)
+
+    # old password should not work
+    res = get_pid(400, "http")
    assert "password authentication failed for user" in res["message"]


-def test_sql_over_http_urlencoding(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser")
-
-    static_proxy.http_query(
-        "select 1",
-        [],
-        user="http+auth$$",
-        password="%+$^&*@!",
-        expected_code=200,
-    )
-
-
 # Beginning a transaction should not impact the next query,
 # which might come from a completely different client.
 def test_http_pool_begin(static_proxy: NeonProxy):
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,10 +1,9 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
-    tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
-from fixtures.types import TenantShardId, TimelineId
+from fixtures.types import TimelineId
 from fixtures.workload import Workload


@@ -83,160 +82,4 @@ def test_sharding_smoke(
        )
        assert timelines == {env.initial_timeline, timeline_b}

-
-def test_sharding_split_unsharded(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    Test that shard splitting works on a tenant created as unsharded (i.e. with
-    ShardCount(0)).
-    """
-    env = neon_env_builder.init_start()
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    # Check that we created with an unsharded TenantShardId: this is the default,
-    # but check it in case we change the default in future
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None
-
-    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
-    workload.init()
-    workload.write_rows(256)
-    workload.validate()
-
-    # Split one shard into two
-    env.attachment_service.tenant_shard_split(tenant_id, shard_count=2)
-
-    # Check we got the shard IDs we expected
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 2)) is not None
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 1, 2)) is not None
-
-    workload.validate()
-
-
-def test_sharding_split_smoke(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    Test the basics of shard splitting:
-    - The API results in more shards than we started with
-    - The tenant's data remains readable
-
-    """
-
-    # We will start with 4 shards and split into 8, then migrate all those
-    # 8 shards onto separate pageservers
-    shard_count = 4
-    split_shard_count = 8
-    neon_env_builder.num_pageservers = split_shard_count
-
-    # 1MiB stripes: enable getting some meaningful data distribution without
-    # writing large quantities of data in this test.  The stripe size is given
-    # in number of 8KiB pages.
-    stripe_size = 128
-
-    # Use S3-compatible remote storage so that we can scrub: this test validates
-    # that the scrubber doesn't barf when it sees a sharded tenant.
-    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
-    neon_env_builder.enable_scrub_on_exit()
-
-    neon_env_builder.preserve_database_files = True
-
-    env = neon_env_builder.init_start(
-        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
-    )
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
-    workload.init()
-
-    # Initial data
-    workload.write_rows(256)
-
-    # Note which pageservers initially hold a shard after tenant creation
-    pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
-
-    # For pageservers holding a shard, validate their ingest statistics
-    # reflect a proper splitting of the WAL.
-    for pageserver in env.pageservers:
-        if pageserver.id not in pre_split_pageserver_ids:
-            continue
-
-        metrics = pageserver.http_client().get_metrics_values(
-            [
-                "pageserver_wal_ingest_records_received_total",
-                "pageserver_wal_ingest_records_committed_total",
-                "pageserver_wal_ingest_records_filtered_total",
-            ]
-        )
-
-        log.info(f"Pageserver {pageserver.id} metrics: {metrics}")
-
-        # Not everything received was committed
-        assert (
-            metrics["pageserver_wal_ingest_records_received_total"]
-            > metrics["pageserver_wal_ingest_records_committed_total"]
-        )
-
-        # Something was committed
-        assert metrics["pageserver_wal_ingest_records_committed_total"] > 0
-
-        # Counts are self consistent
-        assert (
-            metrics["pageserver_wal_ingest_records_received_total"]
-            == metrics["pageserver_wal_ingest_records_committed_total"]
-            + metrics["pageserver_wal_ingest_records_filtered_total"]
-        )
-
-    # TODO: validate that shards have different sizes
-
-    workload.validate()
-
-    assert len(pre_split_pageserver_ids) == 4
-
-    env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
-
-    post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
-    # We should have split into 8 shards, on the same 4 pageservers we started on.
-    assert len(post_split_pageserver_ids) == split_shard_count
-    assert len(set(post_split_pageserver_ids)) == shard_count
-    assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
-
-    workload.validate()
-
-    workload.churn_rows(256)
-
-    workload.validate()
-
-    # Run GC on all new shards, to check they don't barf or delete anything that breaks reads
-    # (compaction was already run as part of churn_rows)
-    all_shards = tenant_get_shards(env, tenant_id)
-    for tenant_shard_id, pageserver in all_shards:
-        pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
-
-    # Restart all nodes, to check that the newly created shards are durable
-    for ps in env.pageservers:
-        ps.restart()
-
-    workload.validate()
-
-    migrate_to_pageserver_ids = list(
-        set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids)
-    )
-    assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count
-
-    # Migrate shards away from the node where the split happened
-    for ps_id in pre_split_pageserver_ids:
-        shards_here = [
-            tenant_shard_id
-            for (tenant_shard_id, pageserver) in all_shards
-            if pageserver.id == ps_id
-        ]
-        assert len(shards_here) == 2
-        migrate_shard = shards_here[0]
-        destination = migrate_to_pageserver_ids.pop()
-
-        log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
-        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
-
-    workload.validate()
+    # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service)
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -128,38 +128,6 @@ def test_sharding_service_smoke(
    assert counts[env.pageservers[2].id] == tenant_shard_count // 2


-def test_node_status_after_restart(
-    neon_env_builder: NeonEnvBuilder,
-):
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_start()
-
-    # Initially we have two online pageservers
-    nodes = env.attachment_service.node_list()
-    assert len(nodes) == 2
-
-    env.pageservers[1].stop()
-
-    env.attachment_service.stop()
-    env.attachment_service.start()
-
-    # Initially readiness check should fail because we're trying to connect to the offline node
-    assert env.attachment_service.ready() is False
-
-    def is_ready():
-        assert env.attachment_service.ready() is True
-
-    wait_until(30, 1, is_ready)
-
-    # We loaded nodes from database on restart
-    nodes = env.attachment_service.node_list()
-    assert len(nodes) == 2
-
-    # We should still be able to create a tenant, because the pageserver which is still online
-    # should have had its availabilty state set to Active.
-    env.attachment_service.tenant_create(TenantId.generate())
-
-
 def test_sharding_service_passthrough(
    neon_env_builder: NeonEnvBuilder,
 ):
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -651,7 +651,9 @@ def test_timeline_delete_works_for_remote_smoke(
    timeline_ids = [env.initial_timeline]
    for i in range(2):
        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
-        with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur:
+        pg = env.endpoints.create_start(f"new{i}")
+
+        with pg.cursor() as cur:
            cur.execute("CREATE TABLE f (i integer);")
            cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -1,7 +1,6 @@
-import time
-
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
+from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn


 #
@@ -119,20 +118,12 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
 # Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
 # record.
 #
-def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
-    tenant_conf = {
-        "checkpoint_distance": f"{128 * 1024}",
-        "compaction_target_size": f"{128 * 1024}",
-        "compaction_threshold": "1",
-        # create image layers eagerly, so that GC can remove some layers
-        "image_creation_threshold": "1",
-        # set PITR interval to be small, so we can do GC
-        "pitr_interval": "0 s",
-    }
-    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+# FIXME: This test is broken
+@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541")
+def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
+    env = neon_simple_env

-    tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock")
+    env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
    endpoint = env.endpoints.create_start(
        "test_vm_bit_clear_on_heap_lock",
        config_lines=[
@@ -148,88 +139,72 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):

    # Install extension containing function needed for test
    cur.execute("CREATE EXTENSION neon_test_utils")
-    cur.execute("CREATE EXTENSION pageinspect")
+
+    cur.execute("SELECT pg_switch_wal()")

    # Create a test table and freeze it to set the all-frozen VM bit on all pages.
    cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
    cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
-
-    cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock")
+    cur.execute("VACUUM FREEZE vmtest_lock")

    # Lock a row. This clears the all-frozen VM bit for that page.
-    cur.execute("BEGIN")
    cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")

    # Remember the XID. We will use it later to verify that we have consumed a lot of
    # XIDs after this.
    cur.execute("select pg_current_xact_id()")
-    locking_xid = int(cur.fetchall()[0][0])
+    locking_xid = cur.fetchall()[0][0]

-    cur.execute("COMMIT")
-
-    # The VM page in shared buffer cache, and the same page as reconstructed
-    # by the pageserver, should be equal.
-    cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-    vm_page_in_cache = (cur.fetchall()[0][0])[:100].hex()
-    cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )")
-    vm_page_at_pageserver = (cur.fetchall()[0][0])[:100].hex()
-
-    assert vm_page_at_pageserver == vm_page_in_cache
-
-    # The above assert is enough to verify the bug that was fixed in
-    # commit 66fa176cc8. But for good measure, we also reproduce the
-    # original problem that the missing VM page update caused. The
-    # rest of the test does that.
-
-    # Kill and restart postgres, to clear the buffer cache.
+    # Stop and restart postgres, to clear the buffer cache.
    #
    # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
    # in a "clean" way. Our neon extension will write a full-page image of the VM
-    # page, and we want to avoid that. A clean shutdown will also not do, for the
-    # same reason.
-    endpoint.stop(mode="immediate")
-
+    # page, and we want to avoid that.
+    endpoint.stop()
    endpoint.start()
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

+    cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
+    tup = cur.fetchall()
+    xmax_before = tup[0][1]
+
    # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks
    # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very
    # low value, so it doesn't take all that many XIDs for autovacuum to kick in.
-    #
-    # We could use test_consume_xids() to consume XIDs much faster,
-    # but it wouldn't speed up the overall test, because we'd still
-    # need to wait for autovacuum to run.
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-    for _ in range(1000):
+    for i in range(1000):
        cur.execute(
-            "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )"
+            """
+        CREATE TEMP TABLE othertable (i int) ON COMMIT DROP;
+        do $$
+        begin
+          for i in 1..100000 loop
+            -- Use a begin-exception block to generate a new subtransaction on each iteration
+            begin
+              insert into othertable values (i);
+            exception when others then
+              raise 'not expected %', sqlerrm;
+            end;
+          end loop;
+        end;
+        $$;
+        """
        )
-        page = (cur.fetchall()[0][0])[:100].hex()
-        log.info(f"VM page contents: {page}")
+        cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
+        tup = cur.fetchall()
+        log.info(f"tuple = {tup}")
+        xmax = tup[0][1]
+        assert xmax == xmax_before

-        cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-        page = (cur.fetchall()[0][0])[:100].hex()
-        log.info(f"VM page contents in cache: {page}")
-
-        cur.execute("select min(datfrozenxid::text::int) from pg_database")
-        datfrozenxid = int(cur.fetchall()[0][0])
-        log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}")
-        if datfrozenxid > locking_xid + 3000000:
-            break
-        time.sleep(0.5)
+        if i % 50 == 0:
+            cur.execute("select datfrozenxid from pg_database where datname='postgres'")
+            datfrozenxid = cur.fetchall()[0][0]
+            if datfrozenxid > locking_xid:
+                break

    cur.execute("select pg_current_xact_id()")
-    curr_xid = int(cur.fetchall()[0][0])
-    assert curr_xid - locking_xid >= 100000
-
-    # Perform GC in the pageserver. Otherwise the compute might still
-    # be able to download the already-deleted SLRU segment from the
-    # pageserver. That masks the original bug.
-    env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id)
-    env.pageserver.http_client().timeline_compact(tenant_id, timeline_id)
-    env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0)
+    curr_xid = cur.fetchall()[0][0]
+    assert int(curr_xid) - int(locking_xid) >= 100000

    # Now, if the VM all-frozen bit was not correctly cleared on
    # replay, we will try to fetch the status of the XID that was
@@ -239,4 +214,3 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
    cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
    tup = cur.fetchall()
    log.info(f"tuple = {tup}")
-    cur.execute("commit transaction")
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,6 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+diesel = { version = "2", features = ["postgres", "r2d2", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
@@ -89,6 +90,7 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
+diesel_derives = { version = "2", features = ["32-column-tables", "postgres", "r2d2", "with-deprecated"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }