Compare commits


6 Commits

Author SHA1 Message Date
Arthur Petukhovsky
1df0f69664 Merge pull request #6973 from neondatabase/rc/2024-02-29-manual
Release 2024-02-29
2024-02-29 17:26:33 +00:00
Vlad Lazar
970066a914 libs: fix expired token in auth decode test (#6963)
The test token expired earlier today (1709200879). I regenerated the
token, but without an expiration date this time.
2024-02-29 17:23:25 +00:00
Arthur Petukhovsky
1ebd3897c0 Merge pull request #6956 from neondatabase/rc/2024-02-28
Release 2024-02-28
2024-02-29 16:39:52 +00:00
Arthur Petukhovsky
6460beffcd Merge pull request #6901 from neondatabase/rc/2024-02-26
Release 2024-02-26
2024-02-26 17:08:19 +00:00
John Spray
6f7f8958db pageserver: only write out legacy tenant config if no generation (#6891)
## Problem

Previously we always wrote out both legacy and modern tenant config
files. The legacy write enabled rollbacks, but we are long past the
point where that is needed.

We still need the legacy format for situations where someone is running
tenants without generations (that will be yanked as well eventually),
but we can avoid writing it out at all if we do have a generation number
set. We implicitly also avoid writing the legacy config if our mode is
Secondary (secondary mode is newer than generations).

## Summary of changes

- Make writing legacy tenant config conditional on there being no
generation number set.
2024-02-26 10:25:25 +00:00
Christian Schwarz
936a00e077 pageserver: remove two obsolete/unused per-timeline metrics (#6893)
over-compensating for the addition of a new per-timeline metric in
https://github.com/neondatabase/neon/pull/6834

part of https://github.com/neondatabase/neon/issues/6737
2024-02-26 09:16:24 +00:00
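
As a rough illustration of the behaviour described in #6891 above (write the legacy tenant config only when no generation number is set, and skip it for Secondary mode), here is a minimal, self-contained Rust sketch. The names (`LocationMode`, `TenantConf`, `persist_tenant_config`) and the file names `config` / `config-v1` are illustrative assumptions rather than the pageserver's actual API.

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Hypothetical stand-in for the pageserver's location mode.
#[derive(Clone, Copy, PartialEq, Eq)]
enum LocationMode {
    Attached,
    Secondary,
}

/// Hypothetical, simplified tenant configuration.
struct TenantConf {
    mode: LocationMode,
    generation: Option<u32>,
    body: String,
}

/// Always write the modern config; write the legacy file only for tenants
/// running without a generation number. Secondary locations postdate
/// generations, so they never get the legacy file either.
fn persist_tenant_config(dir: &Path, conf: &TenantConf) -> io::Result<()> {
    fs::write(dir.join("config-v1"), conf.body.as_bytes())?;

    let needs_legacy = conf.generation.is_none() && conf.mode != LocationMode::Secondary;
    if needs_legacy {
        fs::write(dir.join("config"), conf.body.as_bytes())?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    let conf = TenantConf {
        mode: LocationMode::Attached,
        generation: Some(7), // generation is set, so the legacy file is skipped
        body: "example tenant config".to_string(),
    };
    persist_tenant_config(&std::env::temp_dir(), &conf)
}
```

In the sketch the Secondary check is written out explicitly; the PR notes that in the real change that case is avoided implicitly.
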
106 changed files with 745 additions and 2312 deletions

Cargo.lock (generated): 8 lines changed
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]

View File

@@ -230,8 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems and start the setup again.

View File

@@ -18,6 +18,8 @@ use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -13,6 +13,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;

View File

@@ -1,26 +0,0 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with the `cloud_admin` role and the `postgres` database. If you want a role and a database similar to what we have on the cloud service, you can create them with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

View File

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -1,4 +0,0 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,10 +1,9 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
@@ -118,14 +117,9 @@ async fn handle_tenant_create(
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
service.tenant_create(create_req).await?,
)
}
@@ -191,27 +185,6 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
@@ -243,15 +216,7 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
@@ -586,21 +551,12 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -13,20 +13,14 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
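
The hunks above concern the storage controller's secrets loading, whose doc comment describes a precedence of CLI-provided database URL, then the `DATABASE_URL` environment variable, then AWS Secrets Manager. A minimal sketch of that kind of fallback chain, with hypothetical names and a stubbed-out Secrets Manager path, might look like this:

```rust
use std::env;

/// Hypothetical, trimmed-down secrets container.
#[derive(Debug)]
struct Secrets {
    database_url: String,
    jwt_token: Option<String>,
}

/// Stub for the AWS Secrets Manager path; a real service would call the AWS SDK here.
fn load_from_aws_sm() -> Result<Secrets, String> {
    Err("AWS Secrets Manager lookup is not implemented in this sketch".to_string())
}

/// Resolve secrets in order of preference: CLI argument, then environment, then the remote store.
fn load_secrets(cli_database_url: Option<&str>) -> Result<Secrets, String> {
    if let Some(url) = cli_database_url {
        return Ok(Secrets {
            database_url: url.to_string(),
            jwt_token: env::var("PAGESERVER_JWT_TOKEN").ok(),
        });
    }
    if let Ok(url) = env::var("DATABASE_URL") {
        return Ok(Secrets {
            database_url: url,
            jwt_token: env::var("PAGESERVER_JWT_TOKEN").ok(),
        });
    }
    load_from_aws_sm()
}

fn main() {
    match load_secrets(None) {
        Ok(secrets) => println!("loaded secrets: {secrets:?}"),
        Err(err) => eprintln!("failed to load secrets: {err}"),
    }
}
```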

View File

@@ -7,10 +7,8 @@ use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::NodeSchedulingPolicy;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
@@ -333,15 +331,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
@@ -374,85 +364,7 @@ impl Persistence {
})
.await?;
// Generation is always non-null in the result: if the generation column had been NULL, then we
// should have experienced an SQL Conflict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
Ok(Generation::new(updated.generation as u32))
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +375,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -589,15 +501,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,
#[serde(default)]
pub(crate) placement_policy: String,

View File

@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,17 +335,16 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);
@@ -402,7 +401,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -434,62 +433,22 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!(%node_id, "Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;

View File

@@ -284,6 +284,7 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
use crate::tenant_state::IntentState;
#[test]

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

View File

@@ -14,13 +14,10 @@ use control_plane::attachment_service::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::TenantConfigRequest,
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use pageserver_api::{
models::{
@@ -68,11 +65,6 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
// some data in it.
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// If we receive a call using Secondary mode initially, it will omit generation. We will initialize
// tenant shards into this generation, and as long as it remains in this generation, we will accept
// input generation from future requests as authoritative.
const INITIAL_GENERATION: Generation = Generation::new(0);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -175,21 +167,6 @@ impl From<ReconcileWaitError> for ApiError {
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create((TenantCreateRequest, PlacementPolicy)),
Update(Vec<ShardUpdate>),
}
struct ShardUpdate {
tenant_shard_id: TenantShardId,
placement_policy: PlacementPolicy,
tenant_config: TenantConfig,
/// If this is None, generation is not updated.
generation: Option<Generation>,
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -594,9 +571,6 @@ impl Service {
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
@@ -687,8 +661,8 @@ impl Service {
// after when pageservers start up and register.
let mut node_ids = HashSet::new();
for tsp in &tenant_shard_persistence {
if let Some(node_id) = tsp.generation_pageserver {
node_ids.insert(node_id);
if tsp.generation_pageserver != i64::MAX {
node_ids.insert(tsp.generation_pageserver);
}
}
for node_id in node_ids {
@@ -725,15 +699,18 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
if tsp.generation_pageserver != i64::MAX {
intent.set_attached(
&mut scheduler,
Some(NodeId(tsp.generation_pageserver as u64)),
);
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
@@ -813,8 +790,8 @@ impl Service {
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: Some(0),
generation_pageserver: None,
generation: 0,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
@@ -869,7 +846,7 @@ impl Service {
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_state.generation = Some(new_generation);
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
@@ -919,7 +896,7 @@ impl Service {
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation.unwrap(),
tenant_state.generation,
&tenant_state.shard,
&tenant_state.config,
)),
@@ -933,7 +910,7 @@ impl Service {
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.into().unwrap()),
})
}
@@ -946,7 +923,7 @@ impl Service {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
.map(|ps| (s.generation.into().unwrap(), ps))
}),
}
}
@@ -996,17 +973,7 @@ impl Service {
continue;
};
// If [`Persistence::re_attach`] selected this shard, it must have already
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
if let Some(observed) = shard_state
.observed
.locations
@@ -1036,7 +1003,7 @@ impl Service {
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
@@ -1063,9 +1030,8 @@ impl Service {
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<TenantCreateResponse, ApiError> {
let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
let (response, waiters) = self.do_tenant_create(create_req).await?;
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
Ok(response)
@@ -1074,7 +1040,6 @@ impl Service {
pub(crate) async fn do_tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1100,27 +1065,9 @@ impl Service {
})
.collect::<Vec<_>>();
// If the caller specifies a None generation, it means "start from default". This is different
// to [`Self::tenant_location_config`], where a None generation is used to represent
// an incompletely-onboarded tenant.
let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
tracing::info!(
"tenant_create: secondary mode, generation is_some={}",
create_req.generation.is_some()
);
create_req.generation.map(Generation::new)
} else {
tracing::info!(
"tenant_create: not secondary mode, generation is_some={}",
create_req.generation.is_some()
);
Some(
create_req
.generation
.map(Generation::new)
.unwrap_or(INITIAL_GENERATION),
)
};
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1132,10 +1079,8 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: initial_generation.map(|g| g.into().unwrap() as i32),
// The pageserver is not known until scheduling happens: we will set this column when
// incrementing the generation the first time we attach to a pageserver.
generation_pageserver: None,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
@@ -1175,17 +1120,15 @@ impl Service {
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});
continue;
}
@@ -1199,7 +1142,9 @@ impl Service {
placement_policy.clone(),
);
state.generation = initial_generation;
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.config = create_req.config.clone();
state.schedule(scheduler).map_err(|e| {
@@ -1208,18 +1153,14 @@ impl Service {
))
})?;
// Only include shards in result if we are attaching: the purpose
// of the response is to tell the caller where the shards are attached.
if let Some(node_id) = state.intent.get_attached() {
let generation = state
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
entry.insert(state)
}
};
@@ -1273,114 +1214,12 @@ impl Service {
Ok(())
}
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> TenantCreateOrUpdate {
let mut updates = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
// Use location config mode as an indicator of policy.
let placement_policy = match req.config.mode {
LocationConfigMode::Detached => PlacementPolicy::Detached,
LocationConfigMode::Secondary => PlacementPolicy::Secondary,
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
}
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
// Shards may have initially been created by a Secondary request, where we
// would have left generation as None.
//
// We only update generation the first time we see an attached-mode request,
// and if there is no existing generation set. The caller is responsible for
// ensuring that no non-storage-controller pageserver ever uses a higher
// generation than they passed in here.
use LocationConfigMode::*;
let set_generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
req.config.generation.map(Generation::new)
}
_ => None,
};
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
use LocationConfigMode::*;
let generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
// If a caller provided a generation in a non-attached request, ignore it
// and leave our generation as None: this enables a subsequent update to set
// the generation when setting an attached mode for the first time.
_ => None,
};
TenantCreateOrUpdate::Create(
// Synthesize a creation request
(
TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation,
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
},
placement_policy,
),
)
} else {
TenantCreateOrUpdate::Update(updates)
}
}
/// This API is used by the cloud control plane to migrate unsharded tenants that it created
/// directly with pageservers into this service.
///
/// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
/// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
/// Think of the first attempt to call this API as a transfer of absolute authority over the
/// tenant's source of generation numbers.
///
/// The mode in this request provides coarse-grained control of tenants:
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Secondary to either onboard a tenant without attaching it, or
/// to set an existing tenant to PolicyMode::Secondary
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
@@ -1392,96 +1231,131 @@ impl Service {
)));
}
// First check if this is a creation or an update
let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let waiters = match create_or_update {
TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
let (create_resp, waiters) =
self.do_tenant_create(create_req, placement_policy).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
}
TenantCreateOrUpdate::Update(updates) => {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation,
} in &updates
{
self.persistence
.update_tenant_shard(
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
)
.await?;
}
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Apply updates in-memory
let mut waiters = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation: update_generation,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
tracing::warn!("Shard {tenant_shard_id} removed while updating");
continue;
};
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
shard.policy = placement_policy;
shard.config = tenant_config;
if let Some(generation) = update_generation {
shard.generation = Some(generation);
}
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: *node_id,
})
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if nodes.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
waiters
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id: *node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
let waiters = if let Some(create_req) = maybe_create {
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
} else {
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1501,91 +1375,6 @@ impl Service {
Ok(result)
}
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
let tenant_id = req.tenant_id;
let config = req.config;
self.persistence
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
let mut waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.config = config.clone();
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
}
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Treat this as success because we have stored the configuration. If e.g.
// a node was unavailable at this time, it should not stop us accepting a
// configuration change.
tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
}
Ok(())
}
pub(crate) fn tenant_config_get(
&self,
tenant_id: TenantId,
) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
let config = {
let locked = self.inner.read().unwrap();
match locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
{
Some((_tenant_shard_id, shard)) => shard.config.clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
))
}
}
};
// Unlike the pageserver, we do not have a set of global defaults: the config is
// entirely per-tenant. Therefore the distinction between `tenant_specific_overrides`
// and `effective_config` in the response is meaningless, but we retain that syntax
// in order to remain compatible with the pageserver API.
let response = HashMap::from([
(
"tenant_specific_overrides",
serde_json::to_value(&config)
.context("serializing tenant specific overrides")
.map_err(ApiError::InternalServerError)?,
),
(
"effective_config",
serde_json::to_value(&config)
.context("serializing effective config")
.map_err(ApiError::InternalServerError)?,
),
]);
Ok(response)
}
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
time_travel_req: &TenantTimeTravelRequest,
@@ -1671,60 +1460,6 @@ impl Service {
})?;
}
}
Ok(())
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantId,
) -> Result<(), ApiError> {
// Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
for node_id in shard.intent.get_secondary() {
let node = locked
.nodes
.get(node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
}
targets
};
// TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
// downloads, they can return a clean 202 response instead of the HTTP client timing out.
// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
futs.push(async move {
let result = client.tenant_secondary_download(tenant_shard_id).await;
(result, node)
})
}
// Handle any errors returned by pageservers. This includes cases like this request racing with
// a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
// well as more general cases like 503s, 500s, or timeouts.
while let Some((result, node)) = futs.next().await {
let Err(e) = result else { continue };
// Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
// is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
// than they had hoped for.
tracing::warn!(
"Ignoring tenant secondary download error from pageserver {}: {e}",
node.id,
);
}
Ok(())
}
@@ -2304,8 +2039,8 @@ impl Service {
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
// populate the correct generation as part of its transaction, to protect us
// against racing with changes in the state of the parent.
generation: None,
generation_pageserver: Some(target.node.id.0 as i64),
generation: 0,
generation_pageserver: target.node.id.0 as i64,
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2426,8 +2161,7 @@ impl Service {
.expect("It was present, we just split it");
let old_attached = old_state.intent.get_attached().unwrap();
old_state.intent.clear(scheduler);
let generation = old_state.generation.expect("Shard must have been attached");
(old_attached, generation, old_state.config.clone())
(old_attached, old_state.generation, old_state.config.clone())
};
for child in child_ids {
@@ -2448,7 +2182,7 @@ impl Service {
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = Some(generation);
child_state.generation = generation;
child_state.config = config.clone();
// The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2513,7 +2247,6 @@ impl Service {
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.
@@ -2523,12 +2256,6 @@ impl Service {
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2536,6 +2263,9 @@ impl Service {
)))
}
}
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
@@ -2863,7 +2593,7 @@ impl Service {
observed_loc.conf = None;
}
if tenant_state.intent.demote_attached(config_req.node_id) {
if tenant_state.intent.notify_offline(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
@@ -2930,9 +2660,6 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,

View File

@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -184,13 +181,6 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -218,13 +208,11 @@ impl IntentState {
&self.secondary
}
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
@@ -327,7 +315,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +340,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Some(Generation::new(0)),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -450,16 +438,10 @@ impl TenantState {
// more work on the same pageservers we're already using.
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -469,23 +451,6 @@ impl TenantState {
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is eligible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached further down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -498,28 +463,15 @@ impl TenantState {
modified = true;
}
}
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
@@ -566,12 +518,7 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -649,10 +596,6 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -672,10 +615,6 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -777,17 +716,6 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -808,8 +736,13 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: self
.intent
.get_attached()
.map(|n| n.0 as i64)
.unwrap_or(i64::MAX),
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
@@ -872,10 +805,8 @@ pub(crate) mod tests {
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.notify_offline(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;

View File

@@ -200,7 +200,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await

View File

@@ -605,7 +605,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

View File

@@ -14,6 +14,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -1076,6 +1077,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use super::*;

View File

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;

View File

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}
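The completion/barrier pair above is a thin wrapper over tokio_util's TaskTracker. A minimal usage sketch, assuming a Barrier::wait built on TaskTracker::wait and a tokio runtime with the usual features enabled:

use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};

#[derive(Clone)]
pub struct Completion {
    _token: TaskTrackerToken,
}

#[derive(Clone)]
pub struct Barrier(TaskTracker);

impl Barrier {
    pub async fn wait(self) {
        // Resolves once the tracker is closed and every token has been dropped.
        self.0.wait().await;
    }
}

pub fn channel() -> (Completion, Barrier) {
    let tracker = TaskTracker::new();
    // Close immediately: wait() then only waits for outstanding tokens.
    tracker.close();
    let token = tracker.token();
    (Completion { _token: token }, Barrier(tracker))
}

#[tokio::main]
async fn main() {
    let (completion, barrier) = channel();
    for _ in 0..4 {
        let c = completion.clone();
        tokio::spawn(async move {
            // ... do work ...
            drop(c); // explicit for clarity; dropping at scope end is equivalent
        });
    }
    drop(completion);
    barrier.wait().await; // returns once all clones are gone
}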

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call block_on from blocking code, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried for this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when the full response has finally been sent; sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//
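The handler above instruments the blocking /metrics path with per-phase timings. A rough sketch of that bookkeeping (the function shape and names are assumed, not the actual handler):

fn timed_encode<M>(
    started_at: std::time::Instant,
    gather: impl FnOnce() -> M,
    encode_and_send: impl FnOnce(M) -> std::time::Duration, // returns time spent waiting on the channel
) {
    let spawned_at = std::time::Instant::now();
    let metrics = gather();
    let gathered_at = std::time::Instant::now();
    let wait_time = encode_and_send(metrics);
    let encoded_at = std::time::Instant::now();

    tracing::info!(
        spawning_ms = (spawned_at - started_at).as_millis(),
        collection_ms = (gathered_at - spawned_at).as_millis(),
        // subtract the channel wait so this approximates pure encoding time
        encoding_ms = (encoded_at - gathered_at).saturating_sub(wait_time).as_millis(),
        total_ms = (encoded_at - started_at).as_millis(),
        "responded /metrics"
    );
}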

View File

@@ -415,6 +415,7 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;
#[tokio::test]

View File

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -6,6 +6,7 @@ use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;

View File

@@ -20,6 +20,7 @@ use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
@@ -211,9 +212,9 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
///
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -1202,7 +1203,10 @@ impl ConfigurableSemaphore {
#[cfg(test)]
mod tests {
use std::{fs, num::NonZeroU32};
use std::{
fs,
num::{NonZeroU32, NonZeroUsize},
};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;

View File

@@ -1,5 +1,7 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {

View File

@@ -20,9 +20,10 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::{debug, error};
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -725,7 +726,7 @@ mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use std::{io::ErrorKind, time::Duration};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -734,7 +735,10 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
tenant::{
harness::TenantHarness, remote_timeline_client::remote_timeline_path,
storage_layer::DeltaFileName,
},
};
use super::*;
@@ -1157,8 +1161,13 @@ mod test {
pub(crate) mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
pub struct ConsumerState {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

View File

@@ -58,7 +58,6 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
@@ -66,6 +65,7 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};
@@ -409,23 +409,13 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let (candidates, collection_time) = {
let started_at = std::time::Instant::now();
let candidates =
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
}
};
METRICS.layers_collected.inc_by(candidates.len() as u64);
tracing::info!(
elapsed_ms = collection_time.as_millis(),
total_layers = candidates.len(),
"collection completed"
);
EvictionCandidates::Finished(partitioned) => partitioned,
};
// Debug-log the list of candidates
let now = SystemTime::now();
@@ -456,10 +446,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
let selection = select_victims(&candidates, usage_pre);
METRICS.layers_selected.inc_by(evicted_amount as u64);
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -488,15 +477,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((
file_size,
EvictionError::NotFound
| EvictionError::Downloaded
| EvictionError::Timeout,
))) => {
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
@@ -512,10 +495,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
if !consumed_all {
tracing::info!("all evictions started, waiting");
consumed_all = true;
}
consumed_all = true;
continue;
};
@@ -523,15 +503,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
// have a low eviction waiting timeout because our LRU calculations go stale fast;
// also individual layer evictions could hang because of bugs and we do not want to
// pause disk_usage_based_eviction for such.
let timeout = std::time::Duration::from_secs(5);
match layer.evict_and_wait(timeout).await {
Ok(()) => Ok(file_size),
Err(e) => Err((file_size, e)),
}
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
}
EvictionLayer::Secondary(layer) => {
@@ -553,30 +529,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(usage_assumed, evictions_failed)
};
let started_at = std::time::Instant::now();
let evict_layers = async move {
let mut evict_layers = std::pin::pin!(evict_layers);
let maximum_expected = std::time::Duration::from_secs(10);
let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
let tuple = if let Ok(tuple) = res {
tuple
} else {
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
evict_layers.await
};
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
tuple
};
let evict_layers =
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
@@ -811,8 +763,6 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
@@ -841,8 +791,6 @@ async fn collect_eviction_candidates(
continue;
}
let started_at = std::time::Instant::now();
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
@@ -857,7 +805,6 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
@@ -923,25 +870,7 @@ async fn collect_eviction_candidates(
(partition, candidate)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -956,11 +885,11 @@ async fn collect_eviction_candidates(
},
);
for tenant in secondary_tenants {
for secondary_tenant in secondary_tenants {
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
// to prevent repeated disk usage based evictions from completely draining less often
// updating secondaries.
let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
debug_assert!(
total_layers >= layer_info.resident_layers.len(),
@@ -968,8 +897,6 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);
let started_at = std::time::Instant::now();
layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -991,27 +918,9 @@ async fn collect_eviction_candidates(
)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
tokio::task::yield_now().await;
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -1088,6 +997,30 @@ impl<U: Usage> VictimSelection<U> {
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for TimelineKey {}
impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}
impl std::ops::Deref for TimelineKey {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
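TimelineKey above keys collections by Arc identity rather than by value. A generic sketch of the same pointer-identity pattern (ArcKey is a name invented here for illustration):

use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::sync::Arc;

/// Keys a HashMap/HashSet by Arc identity rather than by the value's Eq/Hash.
struct ArcKey<T>(Arc<T>);

impl<T> PartialEq for ArcKey<T> {
    fn eq(&self, other: &Self) -> bool {
        Arc::ptr_eq(&self.0, &other.0)
    }
}
impl<T> Eq for ArcKey<T> {}
impl<T> Hash for ArcKey<T> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        Arc::as_ptr(&self.0).hash(state);
    }
}

fn main() {
    let a = Arc::new(String::from("timeline"));
    let b = Arc::new(String::from("timeline")); // equal value, different allocation
    let mut set = HashSet::new();
    set.insert(ArcKey(a.clone()));
    assert!(set.contains(&ArcKey(a)));
    assert!(!set.contains(&ArcKey(b))); // identity, not value, decides membership
}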
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

View File

@@ -579,12 +579,6 @@ paths:
required: false
schema:
type: integer
- name: lazy
in: query
required: false
schema:
type: boolean
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles

View File

@@ -816,7 +816,13 @@ async fn tenant_attach_handler(
let tenant = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.await?;
let Some(tenant) = tenant else {
@@ -1412,7 +1418,6 @@ async fn put_tenant_location_config_handler(
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1443,17 +1448,15 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
// similar to the at-startup ordering.
let spawn_mode = if lazy {
tenant::SpawnMode::Lazy
} else {
tenant::SpawnMode::Eager
};
let attached = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.await?
.is_some();

View File

@@ -1915,16 +1915,17 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -2473,64 +2474,6 @@ pub(crate) mod tenant_throttling {
}
}
pub(crate) mod disk_usage_based_eviction {
use super::*;
pub(crate) struct Metrics {
pub(crate) tenant_collection_time: Histogram,
pub(crate) tenant_layer_count: Histogram,
pub(crate) layers_collected: IntCounter,
pub(crate) layers_selected: IntCounter,
pub(crate) layers_evicted: IntCounter,
}
impl Default for Metrics {
fn default() -> Self {
let tenant_collection_time = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collection_seconds",
"Time spent collecting layers from a tenant -- not normalized by collected layer amount",
vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
)
.unwrap();
let tenant_layer_count = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collected_layers",
"Amount of layers gathered from a tenant",
vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
)
.unwrap();
let layers_collected = register_int_counter!(
"pageserver_disk_usage_based_eviction_collected_layers_total",
"Amount of layers collected"
)
.unwrap();
let layers_selected = register_int_counter!(
"pageserver_disk_usage_based_eviction_select_layers_total",
"Amount of layers selected"
)
.unwrap();
let layers_evicted = register_int_counter!(
"pageserver_disk_usage_based_eviction_evicted_layers_total",
"Amount of layers successfully evicted"
)
.unwrap();
Self {
tenant_collection_time,
tenant_layer_count,
layers_collected,
layers_selected,
layers_evicted,
}
}
}
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
@@ -2565,7 +2508,6 @@ pub fn preinitialize_metrics() {
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
Lazy::force(&disk_usage_based_eviction::METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]

View File

@@ -73,6 +73,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
@@ -261,9 +262,7 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit {
_permit: tokio::sync::OwnedSemaphorePermit,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -559,9 +558,9 @@ impl PageCache {
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit {
_permit: res.expect("this semaphore is never closed"),
}),
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
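The page-cache hunk above wraps a semaphore permit and bounds the wait for it; the hunk bumps a page-cache error counter when the wait times out. A minimal sketch of that acquire-with-timeout shape, with the names and error type assumed:

use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::{timeout, Duration};

struct PinnedSlotsPermit(OwnedSemaphorePermit);

#[derive(Debug)]
struct AcquireTimeout;

async fn try_pin_slot(sem: Arc<Semaphore>) -> Result<PinnedSlotsPermit, AcquireTimeout> {
    match timeout(Duration::from_secs(10), sem.acquire_owned()).await {
        Ok(res) => Ok(PinnedSlotsPermit(res.expect("this semaphore is never closed"))),
        Err(_elapsed) => {
            // a real caller would increment its "pinned slot acquire timed out" metric here
            Err(AcquireTimeout)
        }
    }
}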

View File

@@ -27,7 +27,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,6 +44,7 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
@@ -1114,10 +1115,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
}
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1142,6 +1140,9 @@ impl PageServerHandler {
}
};
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);

View File

@@ -37,6 +37,7 @@ impl Value {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {

View File

@@ -109,6 +109,7 @@ pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use toml_edit;
use utils::{
crashsafe,
generation::Generation,
@@ -226,11 +227,7 @@ pub(crate) struct TenantPreload {
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode {
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
/// Tenant has been created during the lifetime of this process
Normal,
Create,
}
@@ -703,37 +700,41 @@ impl Tenant {
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
// During normal operations after startup, we are attaching a tenant.
Normal,
}
let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)
// Before doing any I/O, wait for either of:
// - A client to attempt to access this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
@@ -748,8 +749,6 @@ impl Tenant {
},
)
} else {
// SpawnMode::{Create,Eager} always cause jumping ahead of the
// concurrent_tenant_warmup queue
AttachType::Normal
};
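Both variants above race on-demand activation against a bounded warmup semaphore. A minimal sketch of that select, with the semaphore roles assumed from the surrounding comments:

use tokio::sync::{Semaphore, SemaphorePermit};

enum AttachType<'a> {
    OnDemand,
    Warmup { _permit: SemaphorePermit<'a> },
}

async fn wait_for_activation<'a>(
    activate_now: &'a Semaphore, // a permit is assumed to be added when a client needs the tenant
    warmup_limit: &'a Semaphore, // bounds how many tenants warm up concurrently
) -> AttachType<'a> {
    tokio::select! {
        permit = activate_now.acquire() => {
            let _ = permit.expect("never closed");
            AttachType::OnDemand
        }
        permit = warmup_limit.acquire() => {
            // Hold the permit for the duration of the warmup so the limit is enforced.
            AttachType::Warmup { _permit: permit.expect("never closed") }
        }
    }
}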
@@ -757,7 +756,7 @@ impl Tenant {
(SpawnMode::Create, _) => {
None
},
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
(SpawnMode::Normal, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
@@ -770,7 +769,7 @@ impl Tenant {
}
}
}
(_, None) => {
(SpawnMode::Normal, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
@@ -829,7 +828,7 @@ impl Tenant {
let attached = {
let _attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
tenant_clone.attach(preload, mode, &ctx).await
};
@@ -851,7 +850,7 @@ impl Tenant {
// It also prevents the warmup process from competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
if let AttachType::Warmup(_permit) = attach_type {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
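The warmup path above drains a FuturesUnordered so the permit is held until every timeline has reported an initial logical size. The general shape, as a small sketch:

use futures::stream::{FuturesUnordered, StreamExt};

async fn wait_all<F: std::future::Future<Output = ()>>(futs: impl IntoIterator<Item = F>) {
    let mut futs: FuturesUnordered<F> = futs.into_iter().collect();
    // Polls all futures concurrently; the loop ends once every one has completed.
    while futs.next().await.is_some() {}
}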
@@ -924,7 +923,7 @@ impl Tenant {
deleting: false,
timelines: HashMap::new(),
},
(None, _) => {
(None, SpawnMode::Normal) => {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
}
};
@@ -2383,7 +2382,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.clone(),
self.walredo_mgr.as_ref().map(Arc::clone),
resources,
pg_version,
state,
@@ -3592,18 +3591,25 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
};
use super::*;
use crate::tenant::config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::id::TenantId;
use utils::id::{TenantId, TimelineId};
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3763,7 +3769,7 @@ pub(crate) mod harness {
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3832,8 +3838,10 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

View File

@@ -52,10 +52,7 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
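The two values of the lag default above differ by roughly two orders of magnitude; the larger one is sized from the broker update rate. A back-of-envelope check, using the throughput figure assumed in the comment:

// Figures taken from the comment above; both are assumptions of the sizing argument.
const BROKER_STATS_INTERVAL_SECS: u64 = 1; // broker stats refresh roughly once per second
const ASSUMED_PEAK_WAL_THROUGHPUT: u64 = 1024 * 1024 * 1024; // 1 GiB/s per timeline
const REQUIRED_LAG_HEADROOM: u64 = ASSUMED_PEAK_WAL_THROUGHPUT * BROKER_STATS_INTERVAL_SECS;
// REQUIRED_LAG_HEADROOM == 1 GiB, matching the larger DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG above.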

View File

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Eager, ctx)
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;

View File

@@ -21,6 +21,7 @@
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
@@ -699,6 +700,8 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;

View File

@@ -300,7 +300,7 @@ mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;

View File

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
SpawnMode::Normal,
&ctx,
) {
Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)?;
@@ -1521,7 +1521,7 @@ impl TenantManager {
*child_shard,
child_location_conf,
None,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
shard_identity,
None,
&TENANTS,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
let tenant = guard
.get(&tenant_shard_id)
.cloned()
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;

View File

@@ -1791,12 +1791,14 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Tenant, Timeline,
storage_layer::Layer,
Generation, Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use std::collections::HashSet;
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
format!("contents for {name}").into()

View File

@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
let extension = path.extension();
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,

View File

@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
@@ -121,10 +121,6 @@ impl SecondaryTenant {
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
@@ -168,17 +164,16 @@ impl SecondaryTenant {
self.detail.lock().unwrap().get_layers_for_eviction(self)
}
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
self: &Arc<Self>,
&self,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -192,57 +187,35 @@ impl SecondaryTenant {
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
let this = self.clone();
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await
.expect("secondary eviction should not have panicked");
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
}
}
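Both versions of evict_layer above treat a missing file as acceptable, since a racing downloader may already have removed it. A minimal sketch of that idiom (ignore_not_found mirrors the fs_ext helper referenced in the imports; tokio's fs feature is assumed):

use std::io;

fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

async fn evict_file(path: &std::path::Path) -> io::Result<()> {
    // A racing downloader may have already removed (or replaced) the file;
    // a missing file is not an error for best-effort cache eviction.
    tokio::fs::remove_file(path).await.or_else(ignore_not_found)
}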

View File

@@ -16,8 +16,7 @@ use crate::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
@@ -789,7 +788,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

View File

@@ -18,6 +18,7 @@ use crate::{
};
use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

View File

@@ -72,7 +72,7 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default)]
#[derive(Debug)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,

View File

@@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;

View File

@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use std::time::SystemTime;
use tracing::Instrument;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -208,15 +208,10 @@ impl Layer {
/// If, due to bad luck or a blocked executor, we miss the actual eviction and the layer is
/// re-downloaded, [`EvictionError::Downloaded`] is returned.
///
/// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
/// will happen regardless of whether the future returned by this method completes, unless there is a
/// read access (currently including [`Layer::keep_resident`]) before eviction gets to
/// complete.
///
/// Technically cancellation safe, but cancelling might shift which generation of the
/// download-evict cycle is observed on retry.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
self.0.evict_and_wait(timeout).await
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
self.0.evict_and_wait().await
}
/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -368,7 +363,7 @@ impl Layer {
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
#[cfg(any(feature = "testing", test))]
#[cfg(feature = "testing")]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
let mut rx = self.0.status.subscribe();
@@ -637,7 +632,7 @@ impl LayerInner {
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
use tokio::sync::broadcast::error::RecvError;
assert!(self.have_remote_client);
@@ -657,22 +652,16 @@ impl LayerInner {
if strong.is_some() {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
// idea here is that only one evicter should ever get to witness a strong reference,
// which means whenever get_or_maybe_download upgrades a weak, it must mark up a
// cancelled eviction and signal us, like it currently does.
//
// a second concurrent evict_and_wait will not see a strong reference.
LAYER_IMPL_METRICS.inc_started_evictions();
}
match tokio::time::timeout(timeout, rx.recv()).await {
Ok(Ok(Status::Evicted)) => Ok(()),
Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
Ok(Err(RecvError::Closed)) => {
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
Err(RecvError::Closed) => {
unreachable!("sender cannot be dropped while we are in &self method")
}
Ok(Err(RecvError::Lagged(_))) => {
Err(RecvError::Lagged(_)) => {
// this is quite unlikely, but we are blocking a lot in the async context, so
// we might be missing this because we are stuck on a LIFO slot on a thread
// which is busy blocking for a 1TB database create_image_layers.
@@ -685,7 +674,6 @@ impl LayerInner {
None => Ok(()),
}
}
Err(_timeout) => Err(EvictionError::Timeout),
}
}
@@ -1207,9 +1195,6 @@ pub(crate) enum EvictionError {
/// Evictions must always lose to downloads in races, and this time it happened.
#[error("layer was downloaded instead")]
Downloaded,
#[error("eviction did not happen within timeout")]
Timeout,
}
/// Error internal to the [`LayerInner::get_or_maybe_download`]
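The evict_and_wait changes above bound the wait on the status channel and surface a dedicated timeout error. A condensed sketch of that shape, with the types redeclared here for illustration:

use tokio::sync::broadcast;
use tokio::time::{timeout, Duration};

#[derive(Clone)]
enum Status { Evicted, Downloaded }

enum EvictionError { NotFound, Downloaded, Timeout }

async fn wait_evicted(
    mut rx: broadcast::Receiver<Status>,
    deadline: Duration,
) -> Result<(), EvictionError> {
    match timeout(deadline, rx.recv()).await {
        Ok(Ok(Status::Evicted)) => Ok(()),
        Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
        // The hunk above treats Closed as unreachable and re-checks state on Lagged;
        // collapsed here to keep the sketch short.
        Ok(Err(_closed_or_lagged)) => Err(EvictionError::NotFound),
        Err(_elapsed) => Err(EvictionError::Timeout),
    }
}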

View File

@@ -1,173 +1,13 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};
use super::*;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to a wanted await point, and no further.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
/// Used in tests to indicate a forever-long timeout; has to be longer than the total amount of ADVANCE
/// that timeouts use to advance futures.
const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("smoke_test").unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let img_before = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.await
.unwrap();
data.img
.take()
.expect("tenant harness writes the control file")
};
// important part is evicting the layer, which can be done when there are no more ResidentLayer
// instances -- there currently are none, only two `Layer` values, one in the layermap and one
// in scope.
layer.evict_and_wait(FOREVER).await.unwrap();
// double-evict returns an error, which is valid if both eviction_task and disk usage based
// eviction would both evict the same layer at the same time.
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound));
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.instrument(download_span.clone())
.await
.unwrap();
data.img.take().unwrap()
};
assert_eq!(img_before, img_after);
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
//
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
// artificially slow it down.
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
match layer
.evict_and_wait(std::time::Duration::ZERO)
.await
.unwrap_err()
{
EvictionError::Timeout => {
// expected, but note that the eviction is "still ongoing"
helper.release().await;
// exhaust spawn_blocking pool to ensure it is now complete
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
.await;
}
other => unreachable!("{other:?}"),
}
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
// downloaded locally.
let none = layer.keep_resident().await.unwrap();
assert!(
none.is_none(),
"Expected none, because eviction removed the local file, found: {none:?}"
);
// plain downloading is rarely needed
layer
.download_and_keep_resident()
.instrument(download_span)
.await
.unwrap();
// last important part is deletion on drop: gc and compaction use it for compacted L0 layers
// or fully garbage collected layers. deletion means deleting the local file, and scheduling a
// deletion of the remote file, which has already been unlinked from index_part.json.
//
// marking a layer to be deleted on drop is irreversible; there is no technical reason against
// reversibility, but currently it is not needed so it is not provided.
layer.delete_on_drop();
let path = layer.local_path().to_owned();
// wait_drop produces a future, unconnected to the Layer, which will resolve when the
// LayerInner::drop has completed.
let mut wait_drop = std::pin::pin!(layer.wait_drop());
// paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
// until here
tokio::time::pause();
tokio::time::timeout(ADVANCE, &mut wait_drop)
.await
.expect_err("should had timed out because two strong references exist");
tokio::fs::metadata(&path)
.await
.expect("the local layer file still exists");
let rtc = timeline.remote_client.as_ref().unwrap();
{
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
// when strong references are dropped, the file is deleted and remote deletion is scheduled
wait_drop.await;
let e = tokio::fs::metadata(&path)
.await
.expect_err("the local file is deleted");
assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
rtc.wait_completion().await.unwrap();
assert_eq!(rtc.get_remote_physical_size(), 0);
}
use crate::task_mgr::BACKGROUND_RUNTIME;
use crate::tenant::harness::TenantHarness;
/// This test demonstrates a previous hang when an eviction and a deletion were requested at the same
/// time. Now both of them complete per Arc drop semantics.
@@ -201,10 +41,10 @@ async fn evict_and_wait_on_wanted_deleted() {
let resident = layer.keep_resident().await.unwrap();
{
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
@@ -275,10 +115,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
@@ -298,7 +138,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
// because the keep_resident check alters the wanted-evicted flag without sending a message, the wait
// will never complete
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect("no timeout, because keep_resident re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
@@ -318,10 +158,9 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
.sum::<u64>()
);
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
.await
.expect_err("timeout because spawn_blocking is clogged");
@@ -332,12 +171,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
helper.release().await;
// the second_eviction gets to run here
//
// synchronize to be *strictly* after the second_eviction spawn_blocking run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
tokio::time::timeout(ADVANCE, &mut second_eviction)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
@@ -427,49 +261,3 @@ impl SpawnBlockingPoolHelper {
.await
}
}
#[test]
fn spawn_blocking_pool_helper_actually_works() {
// create a custom runtime for which we know and control how many blocking threads it has
//
// because the amount is not configurable for our helper, expect the same amount as
// BACKGROUND_RUNTIME using the tokio defaults would have.
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(512)
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
rt.block_on(async move {
// this will not return until all threads are spun up and actually executing the code
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
println!("consumed");
let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
// this will not get to run before we release
}));
println!("spawned");
tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
.await
.expect_err("the task should not have gotten to run yet");
println!("tried to join");
consumed.release().await;
println!("released");
tokio::time::timeout(std::time::Duration::from_secs(1), jh)
.await
.expect("no timeout")
.expect("no join error");
println!("joined");
});
}

View File

@@ -10,7 +10,7 @@ mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use fail::fail_point;
use futures::stream::StreamExt;
@@ -1512,14 +1512,10 @@ impl Timeline {
return Ok(None);
};
// curl has this by default
let timeout = std::time::Duration::from_secs(120);
match local_layer.evict_and_wait(timeout).await {
match local_layer.evict_and_wait().await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
Err(EvictionError::Timeout) => Ok(Some(false)),
}
}
}
@@ -3422,10 +3418,26 @@ impl Timeline {
let _g = span.entered();
let new_delta =
Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
let new_delta_path = new_delta.local_path().to_owned();
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
// Sync it to disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable.
//
// NB: timeline dir must be synced _after_ the file contents are durable.
// So, two separate fsyncs are required, they mustn't be batched.
//
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, the fsync overhead can be reduced as follows:
// 1. write them all to temporary file names
// 2. fsync them
// 3. rename to the final name
// 4. fsync the parent directory.
// Note that (1),(2),(3) today happen inside write_to_disk().
//
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
par_fsync::par_fsync(&[self_clone
.conf
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
@@ -3658,10 +3670,25 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
if !image_layers.is_empty() {
// Sync the new layer to disk before adding it to the layer map, to make sure
// we don't garbage collect something based on the new layer, before it has
// reached the disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable
//
// Compaction creates multiple image layers. It would be better to create them all
// and fsync them all in parallel.
let all_paths = image_layers
.iter()
.map(|layer| layer.local_path().to_owned())
.collect::<Vec<_>>();
par_fsync::par_fsync_async(&all_paths)
.await
.context("fsync of newly created layer files")?;
if !all_paths.is_empty() {
par_fsync::par_fsync_async(&[self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
@@ -4248,12 +4275,22 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
let layer_paths: Vec<Utf8PathBuf> = new_layers
.iter()
.map(|l| l.local_path().to_owned())
.collect();
// Fsync all the layer files and directory using multiple threads to
// minimize latency.
par_fsync::par_fsync_async(&layer_paths)
.await
.context("fsync all new layers")?;
let timeline_dir = self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id);
par_fsync::par_fsync_async(&[timeline_dir])
.await
.context("fsync of timeline dir")?;
@@ -5120,7 +5157,8 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let (tenant, ctx) = harness.load().await;
let ctx = any_context();
let tenant = harness.do_try_load(&ctx).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -5134,10 +5172,8 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let forever = std::time::Duration::from_secs(120);
let first = layer.evict_and_wait(forever);
let second = layer.evict_and_wait(forever);
let first = async { layer.evict_and_wait().await };
let second = async { layer.evict_and_wait().await };
let (first, second) = tokio::join!(first, second);
@@ -5156,6 +5192,12 @@ mod tests {
}
}
fn any_context() -> crate::context::RequestContext {
use crate::context::*;
use crate::task_mgr::*;
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -75,13 +75,14 @@ impl Timeline {
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
end_lsn,
target_file_size,
fanout,
ctx,
&ctx_adaptor,
)
.await?;
@@ -142,13 +143,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
type DeltaLayer = ResidentDeltaLayer;
type ImageLayer = ResidentImageLayer;
type RequestContext = crate::context::RequestContext;
type RequestContext = RequestContextAdaptor;
async fn get_layers(
&mut self,
key_range: &Range<Key>,
lsn_range: &Range<Lsn>,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
self.flush_updates().await?;
@@ -169,7 +170,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
key_range: &Range<Key>,
lsn: Lsn,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<Range<Key>>> {
if lsn == self.keyspace.0 {
Ok(pageserver_compaction::helpers::intersect_keyspace(
@@ -205,7 +206,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
}
@@ -215,7 +216,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[ResidentDeltaLayer],
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
@@ -286,7 +287,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
async fn delete_layer(
&mut self,
layer: &OwnArc<PersistentLayerDesc>,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
self.layers_to_delete.push(layer.clone().0);
Ok(())
@@ -298,7 +299,7 @@ impl TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
@@ -360,7 +361,17 @@ impl TimelineAdaptor {
}
}
impl CompactionRequestContext for crate::context::RequestContext {}
pub struct RequestContextAdaptor(pub RequestContext);
impl std::ops::Deref for RequestContextAdaptor {
type Target = RequestContext;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CompactionRequestContext for RequestContextAdaptor {}
#[derive(Debug, Clone)]
pub struct OwnArc<T>(pub Arc<T>);
@@ -438,7 +449,10 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
async fn load_keys<'a>(
&self,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
self.0.load_keys(ctx).await
}
}
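
The adaptor above is the common newtype-plus-Deref pattern: implement an extra trait on a thin wrapper while Deref keeps the wrapped type's fields and methods reachable. In miniature, with made-up types:

// Hedged sketch of the newtype + Deref pattern used by RequestContextAdaptor.
struct Inner {
    name: String,
}

struct Adaptor(pub Inner);

impl std::ops::Deref for Adaptor {
    type Target = Inner;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

// an extra trait implemented on the wrapper only
trait Describe {
    fn describe(&self) -> String;
}

impl Describe for Adaptor {
    fn describe(&self) -> String {
        format!("adaptor around {}", self.name) // Deref lets us reach Inner's fields
    }
}

fn main() {
    let a = Adaptor(Inner { name: "ctx".into() });
    assert_eq!(a.describe(), "adaptor around ctx");
}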

View File

@@ -204,7 +204,6 @@ impl Timeline {
evicted: usize,
errors: usize,
not_evictable: usize,
timeouts: usize,
#[allow(dead_code)]
skipped_for_shutdown: usize,
}
@@ -268,11 +267,7 @@ impl Timeline {
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
// this could cause a lot of allocations in some cases
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))
.await
});
js.spawn(async move { layer.evict_and_wait().await });
stats.candidates += 1;
}
}
@@ -285,9 +280,6 @@ impl Timeline {
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Ok(Err(EvictionError::Timeout)) => {
stats.timeouts += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
@@ -303,8 +295,7 @@ impl Timeline {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
// reminder: timeouts are not eviction cancellations
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
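
For reference, the spawn-then-tally shape of this loop in isolation; a hedged sketch using a stand-in error type rather than the pageserver's EvictionError:

use std::future::Future;
use tokio::task::JoinSet;

#[derive(Default, Debug)]
struct Stats {
    candidates: usize,
    evicted: usize,
    not_evictable: usize,
    errors: usize,
}

async fn tally<F>(evictions: Vec<F>) -> Stats
where
    F: Future<Output = Result<(), &'static str>> + Send + 'static,
{
    let mut stats = Stats::default();
    let mut js = JoinSet::new();
    for fut in evictions {
        js.spawn(fut);
        stats.candidates += 1;
    }
    while let Some(joined) = js.join_next().await {
        match joined {
            Ok(Ok(())) => stats.evicted += 1,
            Ok(Err(_not_evictable)) => stats.not_evictable += 1,
            Err(join_error) if join_error.is_panic() => stats.errors += 1,
            Err(_cancelled) => stats.errors += 1,
        }
    }
    stats
}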

View File

@@ -1667,6 +1667,8 @@ mod tests {
use super::*;
use crate::tenant::harness::*;
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
use crate::DEFAULT_PG_VERSION;

View File

@@ -262,7 +262,7 @@ impl PostgresRedoManager {
// next request will launch a new one.
if let Err(e) = result.as_ref() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),

View File

@@ -252,6 +252,8 @@ mod test {
use super::*;
use std::collections::HashMap;
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
#[test]
fn apply_aux_file_deltas() -> anyhow::Result<()> {

View File

@@ -1,5 +1,7 @@
use tracing;
use tracing::error;
use tracing::info;
use tracing::instrument;
use tracing::{error, info};
use crate::metrics::WalRedoKillCause;
use crate::metrics::WAL_REDO_PROCESS_COUNTERS;

View File

@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -25,8 +25,6 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pagestore_client.h"
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "pgstat.h"
#include "postmaster/bgworker.h"
#include RELFILEINFO_HDR
@@ -62,7 +60,6 @@
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
#define MB ((uint64)1024*1024)
#define HYPER_LOG_LOG_BIT_WIDTH 10
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
typedef struct FileCacheEntry
@@ -87,8 +84,6 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
hyperLogLogState wss_estimation; /* estimation of working set size */
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
} FileCacheControl;
static HTAB *lfc_hash;
@@ -237,14 +232,6 @@ lfc_shmem_startup(void)
lfc_ctl->writes = 0;
dlist_init(&lfc_ctl->lru);
/* Initialize hyper-log-log structure for estimating working set size */
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
/* We need hashes in shared memory */
pfree(lfc_ctl->wss_estimation.hashesArr);
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
@@ -542,11 +529,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
/* Approximate working set */
tag.blockNum = blkno;
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
{
/* Page is not cached */
@@ -985,21 +967,3 @@ local_cache_pages(PG_FUNCTION_ARGS)
else
SRF_RETURN_DONE(funcctx);
}
PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
int32 dc = -1;
if (lfc_size_limit != 0)
{
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
if (reset)
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
LWLockRelease(lfc_lock);
}
PG_RETURN_INT32(dc);
}
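
The block above feeds each page tag's hash into a HyperLogLog and reads the estimate back as the approximate working set size. As a rough illustration of the register-update step only (bucket chosen by the top bits, rank from the leading zeros of the rest; the bias-corrected estimate is omitted), using the same 2^10-register sizing:

// Hedged sketch of a HyperLogLog register update, not the hyperloglog.c implementation.
const BIT_WIDTH: u32 = 10; // matches HYPER_LOG_LOG_BIT_WIDTH above
const REGISTERS: usize = 1 << BIT_WIDTH;

fn hll_add(registers: &mut [u8; REGISTERS], hash: u64) {
    // top BIT_WIDTH bits choose the register
    let idx = (hash >> (64 - BIT_WIDTH)) as usize;
    // rank = leading zeros of the remaining bits, plus one, capped
    let rest = hash << BIT_WIDTH;
    let rank = (rest.leading_zeros() + 1).min(64 - BIT_WIDTH + 1) as u8;
    if rank > registers[idx] {
        registers[idx] = rank;
    }
}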

View File

@@ -1,9 +0,0 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
CREATE FUNCTION approximate_working_set_size(reset bool)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size'
LANGUAGE C PARALLEL SAFE;
GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;

View File

@@ -1,6 +1,6 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.3'
default_version = '1.2'
module_pathname = '$libdir/neon'
relocatable = true
trusted = true

View File

@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
use clap::Arg;
use clap::{self, Arg};
use futures::TryFutureExt;
use proxy::console::messages::MetricsAuxInfo;
use proxy::stream::{PqStream, Stream};

View File

@@ -358,7 +358,8 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::scram::ServerSecret;
use crate::{console::AuthSecret, scram::ServerSecret};
use std::{sync::Arc, time::Duration};
#[tokio::test]
async fn test_project_info_cache_settings() {

View File

@@ -1,4 +1,4 @@
use serde::{Deserialize, Serialize};
use serde::Deserialize;
use std::fmt;
use crate::auth::IpPattern;
@@ -98,16 +98,7 @@ pub struct MetricsAuxInfo {
pub endpoint_id: EndpointId,
pub project_id: ProjectId,
pub branch_id: BranchId,
pub cold_start_info: Option<ColdStartInfo>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
Unknown = 0,
Warm = 1,
PoolHit = 2,
PoolMiss = 3,
pub is_cold_start: Option<bool>,
}
#[cfg(test)]
@@ -120,7 +111,6 @@ mod tests {
"endpoint_id": "endpoint",
"project_id": "project",
"branch_id": "branch",
"cold_start_info": "unknown",
})
}
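
The rename_all = "snake_case" attribute is what makes the JSON carry values like "unknown" for this field. A small self-contained sketch of the round-trip (assumes serde with the derive feature plus serde_json):

use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
enum ColdStartInfo {
    Unknown,
    Warm,
    PoolHit,
    PoolMiss,
}

fn main() {
    assert_eq!(
        serde_json::to_string(&ColdStartInfo::PoolHit).unwrap(),
        "\"pool_hit\""
    );
    let parsed: ColdStartInfo = serde_json::from_str("\"unknown\"").unwrap();
    assert_eq!(parsed, ColdStartInfo::Unknown);
}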

View File

@@ -4,7 +4,7 @@ use crate::{
};
use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::{convert::Infallible, future};
use tokio::net::{TcpListener, TcpStream};

View File

@@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span};
use uuid::Uuid;
use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
console::messages::MetricsAuxInfo,
error::ErrorKind,
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
BranchId, DbName, EndpointId, ProjectId, RoleName,
@@ -42,7 +42,7 @@ pub struct RequestMonitoring {
error_kind: Option<ErrorKind>,
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
cold_start_info: Option<ColdStartInfo>,
is_cold_start: Option<bool>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -91,7 +91,7 @@ impl RequestMonitoring {
error_kind: None,
auth_method: None,
success: false,
cold_start_info: None,
is_cold_start: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -115,7 +115,7 @@ impl RequestMonitoring {
self.set_endpoint_id(x.endpoint_id);
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.cold_start_info = x.cold_start_info;
self.is_cold_start = x.is_cold_start;
}
pub fn set_project_id(&mut self, project_id: ProjectId) {

View File

@@ -93,7 +93,7 @@ struct RequestData {
/// Or if we make it to proxy_pass
success: bool,
/// Indicates if the cplane started the new compute node for this request.
cold_start_info: Option<String>,
is_cold_start: Option<bool>,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
@@ -121,10 +121,7 @@ impl From<RequestMonitoring> for RequestData {
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success,
cold_start_info: value
.cold_start_info
.as_ref()
.map(|x| serde_json::to_string(x).unwrap_or_default()),
is_cold_start: value.is_cold_start,
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
@@ -458,7 +455,7 @@ mod tests {
region: "us-east-1",
error: None,
success: rng.gen(),
cold_start_info: Some("no".into()),
is_cold_start: Some(true),
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -528,16 +525,16 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
]
(1315032, 3, 6000),
(1315025, 3, 6000),
(1315085, 3, 6000),
(1315042, 3, 6000),
(1315172, 3, 6000),
(1315014, 3, 6000),
(1314806, 3, 6000),
(1315042, 3, 6000),
(438563, 1, 2000)
],
);
tmpdir.close().unwrap();
@@ -566,12 +563,12 @@ mod tests {
assert_eq!(
file_stats,
[
(1220668, 5, 10000),
(1226818, 5, 10000),
(1228612, 5, 10000),
(1227974, 5, 10000),
(1219252, 5, 10000)
]
(1220433, 5, 10000),
(1226583, 5, 10000),
(1228377, 5, 10000),
(1227739, 5, 10000),
(1219017, 5, 10000)
],
);
tmpdir.close().unwrap();
@@ -602,12 +599,12 @@ mod tests {
assert_eq!(
file_stats,
[
(1206315, 5, 10000),
(1206046, 5, 10000),
(1206339, 5, 10000),
(1206327, 5, 10000),
(1206582, 5, 10000)
]
(1206080, 5, 10000),
(1205811, 5, 10000),
(1206104, 5, 10000),
(1206092, 5, 10000),
(1206347, 5, 10000)
],
);
tmpdir.close().unwrap();
@@ -631,16 +628,16 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
]
(1315032, 3, 6000),
(1315025, 3, 6000),
(1315085, 3, 6000),
(1315042, 3, 6000),
(1315172, 3, 6000),
(1315014, 3, 6000),
(1314806, 3, 6000),
(1315042, 3, 6000),
(438563, 1, 2000)
],
);
tmpdir.close().unwrap();
@@ -676,7 +673,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
[(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
);
tmpdir.close().unwrap();

View File

@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{http, sasl, scram};
use crate::{auth, http, sasl, scram};
use anyhow::{bail, Context};
use async_trait::async_trait;
use rstest::rstest;

View File

@@ -11,6 +11,7 @@ use bytes::{Bytes, BytesMut};
use futures::{SinkExt, StreamExt};
use postgres_protocol::message::frontend;
use tokio::io::{AsyncReadExt, DuplexStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::TlsConnect;
use tokio_util::codec::{Decoder, Encoder};

View File

@@ -667,6 +667,7 @@ impl<C: ClientInnerExt> Drop for Client<C> {
#[cfg(test)]
mod tests {
use env_logger;
use std::{mem, sync::atomic::AtomicBool};
use super::*;

View File

@@ -19,6 +19,8 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
use std::convert::TryInto;
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 7;
@@ -217,9 +219,12 @@ impl Storage for FileStorage {
#[cfg(test)]
mod test {
use super::FileStorage;
use super::*;
use crate::SafeKeeperConf;
use anyhow::Result;
use tokio::fs;
use utils::lsn::Lsn;
use utils::{id::TenantTimelineId, lsn::Lsn};
fn stub_conf() -> SafeKeeperConf {
let workdir = camino_tempfile::tempdir().unwrap().into_path();

View File

@@ -2,7 +2,8 @@
//! protocol commands.
use anyhow::Context;
use std::str::{self, FromStr};
use std::str::FromStr;
use std::str::{self};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, info_span, Instrument};
@@ -15,8 +16,8 @@ use crate::safekeeper::Term;
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
use crate::{GlobalTimelines, SafeKeeperConf};
use postgres_backend::PostgresBackend;
use postgres_backend::QueryError;
use postgres_backend::{self, PostgresBackend};
use postgres_ffi::PG_TLI;
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
use regex::Regex;

View File

@@ -2180,11 +2180,6 @@ class NeonAttachmentService(MetricsGetter):
self.stop(immediate=True)
@dataclass
class LogCursor:
_line_no: int
class NeonPageserver(PgProtocol):
"""
An object representing a running pageserver.
@@ -2348,18 +2343,7 @@ class NeonPageserver(PgProtocol):
value = self.http_client().get_metric_value(metric)
assert value == 0, f"Nonzero {metric} == {value}"
def assert_log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Tuple[str, LogCursor]:
"""Convenient for use inside wait_until()"""
res = self.log_contains(pattern, offset=offset)
assert res is not None
return res
def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Optional[Tuple[str, LogCursor]]:
def log_contains(self, pattern: str) -> Optional[str]:
"""Check that the pageserver log contains a line that matches the given regex"""
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
@@ -2373,17 +2357,12 @@ class NeonPageserver(PgProtocol):
# no guarantee it is already present in the log file. This hasn't
# been a problem in practice; our python tests are not fast enough
# to hit that race condition.
skip_until_line_no = 0 if offset is None else offset._line_no
cur_line_no = 0
with logfile.open("r") as f:
for line in f:
if cur_line_no < skip_until_line_no:
cur_line_no += 1
continue
if contains_re.search(line):
# found it!
cur_line_no += 1
return (line, LogCursor(cur_line_no))
return line
return None
def tenant_attach(

View File

@@ -286,11 +286,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
self.verbose_error(res)
def tenant_location_conf(
self,
tenant_id: Union[TenantId, TenantShardId],
location_conf=dict[str, Any],
flush_ms=None,
lazy: Optional[bool] = None,
self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
@@ -299,9 +295,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if flush_ms is not None:
params["flush_ms"] = str(flush_ms)
if lazy is not None:
params["lazy"] = "true" if lazy else "false"
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
json=body,

View File

@@ -20,7 +20,7 @@ def assert_tenant_state(
tenant: TenantId,
expected_state: str,
message: Optional[str] = None,
) -> None:
):
tenant_status = pageserver_http.tenant_status(tenant)
log.info(f"tenant_status: {tenant_status}")
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
return current_lsn
if i % 10 == 0:
log.info(
"{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
tenant, timeline, lsn, current_lsn, i + 1
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(0.1)
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
iterations: int = 20,
interval: Optional[float] = None,
**delete_args,
) -> None:
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
@@ -302,7 +302,7 @@ def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
) -> None:
):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
keys = response["KeyCount"]

View File

@@ -252,16 +252,6 @@ class S3Storage:
log.info(f"deleted {cnt} objects from remote storage")
def tenant_path(self, tenant_id: TenantId) -> str:
return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
def heatmap_content(self, tenant_id: TenantId):
r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
return json.loads(r["Body"].read().decode("utf-8"))
RemoteStorage = Union[LocalFsStorage, S3Storage]

View File

@@ -369,12 +369,7 @@ def start_in_background(
return spawned_process
WaitUntilRet = TypeVar("WaitUntilRet")
def wait_until(
number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
) -> WaitUntilRet:
def wait_until(number_of_iterations: int, interval: float, func: Fn):
"""
Wait until 'func' returns successfully, without exception. Returns the
last return value from the function.
@@ -392,18 +387,6 @@ def wait_until(
raise Exception("timed out while waiting for %s" % func) from last_exception
def assert_eq(a, b) -> None:
assert a == b
def assert_gt(a, b) -> None:
assert a > b
def assert_ge(a, b) -> None:
assert a >= b
def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
"""
Fast way to populate data.

View File

@@ -63,11 +63,10 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
]
)
wait_until(
50,
0.1,
lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
)
def log_contains_bad_request():
assert env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
wait_until(50, 0.1, log_contains_bad_request)
def test_null_body(negative_env: NegativeTests):

View File

@@ -200,7 +200,7 @@ class EvictionEnv:
tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
def statvfs_called():
pageserver.assert_log_contains(".*running mocked statvfs.*")
assert pageserver.log_contains(".*running mocked statvfs.*")
# we most likely have already completed multiple runs
wait_until(10, 1, statvfs_called)
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
assert actual_change >= target, "eviction must always evict more than target"
time.sleep(1) # give log time to flush
env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
@@ -801,9 +801,10 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
def less_than_max_usage_pct():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
@@ -844,9 +845,10 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
def more_than_min_avail_bytes_freed():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)

View File

@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
time.sleep(10) # let compaction to be performed
env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

View File

@@ -1,84 +0,0 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_explain_with_lfc_stats"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
log.info("running explain analyze without LFC values to verify they do not show up in the plan")
cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" not in plan
log.info("running explain analyze WITH LFC values to verify they do now show up")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" in plan
log.info("running explain analyze WITH LFC values to verify json output")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
jsonplan = cur.fetchall()[0][0]
log.debug(jsonplan)
# Directly access the 'Plan' part of the first element of the JSON array
plan_details = jsonplan[0]["Plan"]
# Extract "File Cache Hits" and "File Cache Misses"
file_cache_hits = plan_details.get("File Cache Hits")
file_cache_misses = plan_details.get("File Cache Misses")
# Now you can assert the values
assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"

View File

@@ -184,13 +184,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
# NB: the layer file is now unlinked from the index part, but, because we made the delete
# operation stuck, the layer file itself is still in the remote_storage
wait_until(
10,
0.5,
lambda: env.pageserver.assert_log_contains(
f".*{tenant_id}.*at failpoint.*{failpoint_name}"
),
)
def delete_at_pause_point():
assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
wait_until(10, 0.5, delete_at_pause_point)
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
)

View File

@@ -1,74 +0,0 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_approximate_working_set_size"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
cur.execute("create extension neon")
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
# prepare index access below
cur.execute(
"ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
# ensure correct query plans and stats
cur.execute("vacuum ANALYZE pgbench_accounts")
# determine table size - working set should approximate table size after sequential scan
pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
cur.execute("select approximate_working_set_size(true)")
cur.execute(
'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
)
# verify working set size after sequential scan matches table size and reset working set for next test
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
assert pages * 0.8 < blocks < pages * 1.2
# run a few point queries with index lookup
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10

View File

@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
def assert_logged():
if not log_expected:
return
env.pageserver.assert_log_contains(f".*{msg_id}.*")
assert env.pageserver.log_contains(f".*{msg_id}.*")
wait_until(10, 0.5, assert_logged)

View File

@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.3",)
assert cur.fetchone() == ("1.2",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
res = cur.fetchall()
log.info(res)

View File

@@ -1,12 +1,9 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until
@skip_on_postgres(
PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
)
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
env = neon_simple_env
env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

View File

@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(
main_pageserver.start()
def assert_deletions_submitted(n: int) -> None:
def assert_deletions_submitted(n: int):
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
# After restart, issue a flush to kick the deletion frontend to do recovery.

View File

@@ -1,4 +1,3 @@
import json
import random
from pathlib import Path
from typing import Any, Dict, Optional
@@ -11,7 +10,7 @@ from fixtures.pageserver.utils import (
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -437,7 +436,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -493,35 +491,18 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Do evictions on attached pageserver, check secondary follows along
# ==================================================================
try:
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
log.info(f"Victim layer: {layer_to_evict.name}")
ps_attached.http_client().evict_layer(
tenant_id, timeline_id, layer_name=layer_to_evict.name
)
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_layers = set(
layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
)
assert layer_to_evict.name not in heatmap_layers
assert some_other_layer.name in heatmap_layers
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# On assertion failures, log some details to help with debugging
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}")
raise
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
# Scrub the remote storage
# ========================

View File

@@ -1,110 +0,0 @@
import asyncio
import time
from typing import Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
tenant_get_shards,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
TIMELINE_COUNT = 10
ENTRIES_PER_TIMELINE = 10_000
CHECKPOINT_TIMEOUT_SECONDS = 60
TENANT_CONF = {
# Large `checkpoint_distance` effectively disables size
# based checkpointing.
"checkpoint_distance": f"{2 * 1024 ** 3}",
"checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
}
async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
with env.endpoints.create_start("main", tenant_id=tenant) as ep:
conn = await ep.connect_async()
try:
await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
await conn.execute(
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
)
finally:
await conn.close(timeout=10)
last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return tenant, timeline, last_flush_lsn
async def workload(
env: NeonEnv, timelines: int, entries: int
) -> list[Tuple[TenantId, TimelineId, Lsn]]:
workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
return await asyncio.gather(*workers)
def wait_until_pageserver_is_caught_up(
env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
):
for tenant, timeline, last_flush_lsn in last_flush_lsns:
shards = tenant_get_shards(env, tenant)
for tenant_shard_id, pageserver in shards:
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
def query():
value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
assert value is not None
return value
# The metric gets initialised on the first update.
# Retry a few times, but return 0 if it's stable.
try:
return float(wait_until(3, 0.5, query))
except Exception:
return 0
@pytest.mark.parametrize("immediate_shutdown", [True, False])
def test_pageserver_small_inmemory_layers(
neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
):
"""
Test that open layers get flushed after the `checkpoint_timeout` config
and do not require WAL reingest upon restart.
The workload creates a number of timelines and writes some data to each,
but not enough to trigger flushes via the `checkpoint_distance` config.
"""
env = neon_env_builder.init_configs()
env.start()
last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
ps_http_client = env.pageserver.http_client()
total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info("Sleeping for checkpoint timeout ...")
time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)
env.pageserver.restart(immediate=immediate_shutdown)
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
leeway = total_wal_ingested_before_restart * 5 / 100
assert total_wal_ingested_after_restart <= leeway

View File

@@ -28,14 +28,7 @@ from fixtures.remote_storage import (
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
assert_eq,
assert_ge,
assert_gt,
print_gc_result,
query_scalar,
wait_until,
)
from fixtures.utils import print_gc_result, query_scalar, wait_until
from requests import ReadTimeout
@@ -127,10 +120,10 @@ def test_remote_storage_backup_and_restore(
log.info(f"upload of checkpoint {checkpoint_number} is done")
# Check that we had to retry the uploads
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
".*failed to perform remote task UploadLayer.*, will retry.*"
)
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)
@@ -299,9 +292,9 @@ def test_remote_storage_upload_queue_retries(
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
# let all future operations queue up
configure_storage_sync_failpoints("return")
@@ -329,17 +322,17 @@ def test_remote_storage_upload_queue_retries(
churn_while_failpoints_active_thread.start()
# wait for churn thread's data to get stuck in the upload queue
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
# unblock churn operations
configure_storage_sync_failpoints("off")
# ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
# The churn thread doesn't make progress once it blocks on the first wait_completion() call,
# so, give it some time to wrap up.
@@ -891,23 +884,26 @@ def wait_upload_queue_empty(
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
)
== 0,
)
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
)
== 0,
)
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
)
== 0,
)

View File

@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
# Marking a pageserver offline should migrate tenants away from it.
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int) -> None:
def node_evacuated(node_id: int):
counts = get_node_shard_counts(env, tenant_ids)
assert counts[node_id] == 0
@@ -146,8 +146,6 @@ def test_sharding_service_smoke(
for tid in tenant_ids:
tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
env.attachment_service.consistency_check()
# Set a scheduling policy on one node, create all the tenants, observe
# that the scheduling policy is respected.
env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -258,8 +256,9 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
env.attachment_service.consistency_check()
@pytest.mark.parametrize("warm_up", [True, False])
def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
def test_sharding_service_onboarding(
neon_env_builder: NeonEnvBuilder,
):
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
@@ -307,23 +306,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
},
)
if warm_up:
origin_ps.http_client().tenant_heatmap_upload(tenant_id)
# We expect to be called via live migration code, which may try to configure the tenant into secondary
# mode before attaching it.
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
"generation": None,
},
)
virtual_ps_http.tenant_secondary_download(tenant_id)
# Call into attachment service to onboard the tenant
generation += 1
virtual_ps_http.tenant_location_conf(
@@ -369,9 +351,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
assert len(dest_tenants) == 1
assert TenantId(dest_tenants[0]["id"]) == tenant_id
# sharding service advances generation by 1 when it first attaches. We started
# with a nonzero generation so this equality also proves that the generation
# was properly carried over during onboarding.
# sharding service advances generation by 1 when it first attaches
assert dest_tenants[0]["generation"] == generation + 1
# The onboarded tenant should survive a restart of sharding service
@@ -382,31 +362,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
dest_ps.stop()
dest_ps.start()
# Having onboarded via /location_config, we should also be able to update the
# TenantConf part of LocationConf, without inadvertently resetting the generation
modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
# The generation has moved on since we onboarded
assert generation != dest_tenant_before_conf_change["generation"]
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": modified_tenant_conf,
# This is intentionally a stale generation
"generation": generation,
},
)
dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
assert (
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
)
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
env.attachment_service.consistency_check()
@@ -450,7 +405,7 @@ def test_sharding_service_compute_hook(
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int) -> None:
def node_evacuated(node_id: int):
counts = get_node_shard_counts(env, [env.initial_tenant])
assert counts[node_id] == 0
@@ -712,41 +667,3 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
svc.request(
"POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
)
def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
"""
Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
supplying the whole LocationConf.
"""
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
http = env.attachment_service.pageserver_api()
default_value = "7days"
new_value = "1h"
http.set_tenant_config(tenant_id, {"pitr_interval": new_value})
# Ensure the change landed on the storage controller
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] == new_value
assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value
# Ensure the change made it down to the pageserver
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == new_value
assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value
# Omitting a value clears it. This looks different in storage controller
# vs. pageserver API calls, because pageserver has defaults.
http.set_tenant_config(tenant_id, {})
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] is None
assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == default_value
assert "pitr_interval" not in readback_ps.tenant_specific_overrides
env.attachment_service.consistency_check()

View File

@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
"period": "20s",
"threshold": "23h",
}
assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()

View File

@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
return ps_http.tenant_delete(tenant_id)
def hit_remove_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
def hit_run_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
with concurrent.futures.ThreadPoolExecutor() as executor:
background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=timeline_create).start()
def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(100, 0.1, hit_initdb_upload_failpoint)
def creation_connection_timed_out():
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
"POST.*/timeline.* request was dropped before completing"
)
@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=tenant_delete).start()
def deletion_arrived():
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
)
# Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
env.pageserver.assert_log_contains(CANCELLED_ERROR)
assert env.pageserver.log_contains(CANCELLED_ERROR)
assert not env.pageserver.log_contains(
".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
)

Some files were not shown because too many files have changed in this diff.