From d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 29 Feb 2024 10:00:01 +0000
Subject: [PATCH 01/18] control_plane: storage controller secrets by env
 (#6952)

## Problem

Sometimes folks prefer not to expose secrets as CLI args.

## Summary of changes

- Add ability to load secrets from environment variables.

We can eventually remove the AWS SM code path here if nobody is using it
-- we don't need to maintain three ways to load secrets.
---
 control_plane/attachment_service/src/main.rs | 27 +++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index db4f00644f..5b952ae4fc 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -79,13 +79,38 @@ impl Secrets {
         "neon-storage-controller-control-plane-jwt-token";
     const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
 
+    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
+    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
+    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
+
+    /// Load secrets from, in order of preference:
+    /// - CLI args if database URL is provided on the CLI
+    /// - Environment variables if DATABASE_URL is set.
+    /// - AWS Secrets Manager secrets
     async fn load(args: &Cli) -> anyhow::Result<Self> {
         match &args.database_url {
             Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+            None => match std::env::var(Self::DATABASE_URL_ENV) {
+                Ok(database_url) => Self::load_env(database_url),
+                Err(_) => Self::load_aws_sm().await,
+            },
         }
     }
 
+    fn load_env(database_url: String) -> anyhow::Result<Self> {
+        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
+            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
+            Err(_) => None,
+        };
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
+            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
+        })
+    }
+
     async fn load_aws_sm() -> anyhow::Result<Self> {
         let Ok(region) = std::env::var("AWS_REGION") else {
             anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

From 4d426f6fbe596a12c19b86bbf43313e3452ac73b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 29 Feb 2024 13:26:29 +0200
Subject: [PATCH 02/18] feat: support lazy, queued tenant attaches (#6907)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add off-by-default support for lazy queued tenant activation on attach.
This should be useful on bulk migrations as some tenants will be
activated faster due to operations or endpoint startup. Eventually all
tenants will get activated by reusing the same mechanism we have at
startup (`PageserverConf::concurrent_tenant_warmup`).

The difference to lazy attached tenants to startup ones is that we leave
their initial logical size calculation be triggered by WalReceiver or
consumption metrics.

Fixes: #6315

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 pageserver/src/config.rs                  |   6 +-
 pageserver/src/http/openapi_spec.yml      |   6 +
 pageserver/src/http/routes.rs             |  25 ++-
 pageserver/src/tenant.rs                  |  68 ++++----
 pageserver/src/tenant/delete.rs           |   2 +-
 pageserver/src/tenant/mgr.rs              |  12 +-
 test_runner/fixtures/pageserver/http.py   |   9 +-
 test_runner/regress/test_timeline_size.py | 200 ++++++++++++++++++++--
 8 files changed, 255 insertions(+), 73 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index d18b8d6885..0a7172bde2 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -212,9 +212,9 @@ pub struct PageServerConf {
 
     pub log_format: LogFormat,
 
-    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
-    /// does not limit tenants loaded in response to client I/O.  A lower value implicitly deprioritizes
-    /// loading such tenants, vs. other work in the system.
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
+    ///
+    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
     pub concurrent_tenant_warmup: ConfigurableSemaphore,
 
     /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 5afb3ba63d..19b5fb7e79 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -579,6 +579,12 @@ paths:
         required: false
         schema:
           type: integer
+      - name: lazy
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
     put:
       description: |
         Configures a _tenant location_, that is how a particular pageserver handles
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 12bd21fd7b..9d92fbaee0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -816,13 +816,7 @@ async fn tenant_attach_handler(
 
     let tenant = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
         .await?;
 
     let Some(tenant) = tenant else {
@@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler(
 
     let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
     let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+    let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
+    // lazy==true queues up for activation or jumps the queue like normal when a compute connects,
+    // similar to at startup ordering.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
+
     let attached = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
         .await?
         .is_some();
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6a63a2adeb..f027e9d4b1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -227,7 +227,11 @@ pub(crate) struct TenantPreload {
 /// When we spawn a tenant, there is a special mode for tenant creation that
 /// avoids trying to read anything from remote storage.
 pub(crate) enum SpawnMode {
-    Normal,
+    /// Activate as soon as possible
+    Eager,
+    /// Lazy activation in the background, with the option to skip the queue if the need comes up
+    Lazy,
+    /// Tenant has been created during the lifetime of this process
     Create,
 }
 
@@ -700,41 +704,37 @@ impl Tenant {
                     .and_then(|x| x.initial_tenant_load_remote.take());
 
                 enum AttachType<'a> {
-                    // During pageserver startup, we are attaching this tenant lazily in the background
-                    Warmup(tokio::sync::SemaphorePermit<'a>),
-                    // During pageserver startup, we are attaching this tenant as soon as we can,
-                    // because a client tried to access it.
+                    /// We are attaching this tenant lazily in the background.
+                    Warmup {
+                        _permit: tokio::sync::SemaphorePermit<'a>,
+                        during_startup: bool
+                    },
+                    /// We are attaching this tenant as soon as we can, because for example an
+                    /// endpoint tried to access it.
                     OnDemand,
-                    // During normal operations after startup, we are attaching a tenant.
+                    /// During normal operations after startup, we are attaching a tenant, and
+                    /// eager attach was requested.
                     Normal,
                 }
 
-                // Before doing any I/O, wait for either or:
-                // - A client to attempt to access to this tenant (on-demand loading)
-                // - A permit to become available in the warmup semaphore (background warmup)
-                //
-                // Some-ness of init_order is how we know if we're attaching during startup or later
-                // in process lifetime.
-                let attach_type = if init_order.is_some() {
+                let attach_type = if matches!(mode, SpawnMode::Lazy) {
+                    // Before doing any I/O, wait for at least one of:
+                    // - A client attempting to access to this tenant (on-demand loading)
+                    // - A permit becoming available in the warmup semaphore (background warmup)
+
                     tokio::select!(
-                        _ = tenant_clone.activate_now_sem.acquire() => {
+                        permit = tenant_clone.activate_now_sem.acquire() => {
+                            let _ = permit.expect("activate_now_sem is never closed");
                             tracing::info!("Activating tenant (on-demand)");
                             AttachType::OnDemand
                         },
-                        permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
-                            match permit_result {
-                                Ok(p) => {
-                                    tracing::info!("Activating tenant (warmup)");
-                                    AttachType::Warmup(p)
-                                }
-                                Err(_) => {
-                                    // This is unexpected: the warmup semaphore should stay alive
-                                    // for the lifetime of init_order.  Log a warning and proceed.
-                                    tracing::warn!("warmup_limit semaphore unexpectedly closed");
-                                    AttachType::Normal
-                                }
+                        permit = conf.concurrent_tenant_warmup.inner().acquire() => {
+                            let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
+                            tracing::info!("Activating tenant (warmup)");
+                            AttachType::Warmup {
+                                _permit,
+                                during_startup: init_order.is_some()
                             }
-
                         }
                         _ = tenant_clone.cancel.cancelled() => {
                             // This is safe, but should be pretty rare: it is interesting if a tenant
@@ -749,6 +749,8 @@ impl Tenant {
                         },
                     )
                 } else {
+                    // SpawnMode::{Create,Eager} always cause jumping ahead of the
+                    // concurrent_tenant_warmup queue
                     AttachType::Normal
                 };
 
@@ -756,7 +758,7 @@ impl Tenant {
                     (SpawnMode::Create, _) => {
                         None
                     },
-                    (SpawnMode::Normal, Some(remote_storage)) => {
+                    (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
                         let _preload_timer = TENANT.preload.start_timer();
                         let res = tenant_clone
                             .preload(remote_storage, task_mgr::shutdown_token())
@@ -769,7 +771,7 @@ impl Tenant {
                             }
                         }
                     }
-                    (SpawnMode::Normal, None) => {
+                    (_, None) => {
                         let _preload_timer = TENANT.preload.start_timer();
                         None
                     }
@@ -828,7 +830,7 @@ impl Tenant {
                 let attached = {
                     let _attach_timer = match mode {
                         SpawnMode::Create => None,
-                        SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                        SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
                     };
                     tenant_clone.attach(preload, mode, &ctx).await
                 };
@@ -850,7 +852,7 @@ impl Tenant {
                 // It also prevents the warmup proccess competing with the concurrency limit on
                 // logical size calculations: if logical size calculation semaphore is saturated,
                 // then warmup will wait for that before proceeding to the next tenant.
-                if let AttachType::Warmup(_permit) = attach_type {
+                if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
                     let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                     tracing::info!("Waiting for initial logical sizes while warming up...");
                     while futs.next().await.is_some() {}
@@ -923,7 +925,7 @@ impl Tenant {
                 deleting: false,
                 timelines: HashMap::new(),
             },
-            (None, SpawnMode::Normal) => {
+            (None, _) => {
                 anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
             }
         };
@@ -3769,7 +3771,7 @@ pub(crate) mod harness {
             let preload = tenant
                 .preload(&self.remote_storage, CancellationToken::new())
                 .await?;
-            tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
+            tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
 
             tenant.state.send_replace(TenantState::Active);
             for timeline in tenant.timelines.lock().unwrap().values() {
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 3d138da7af..ffb7206b1e 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
             .expect("cant be stopping or broken");
 
         tenant
-            .attach(preload, super::SpawnMode::Normal, ctx)
+            .attach(preload, super::SpawnMode::Eager, ctx)
             .await
             .context("attach")?;
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 8f0f73d4b5..805d44f93d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
             shard_identity,
             Some(init_order.clone()),
             &TENANTS,
-            SpawnMode::Normal,
+            SpawnMode::Lazy,
             &ctx,
         ) {
             Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
 
                 // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
                 // the caller thinks they're creating but the tenant already existed.  We must switch to
-                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
                 // rather than assuming it to be empty.
-                spawn_mode = SpawnMode::Normal;
+                spawn_mode = SpawnMode::Eager;
             }
             Some(TenantSlot::Secondary(state)) => {
                 info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
             shard_identity,
             None,
             self.tenants,
-            SpawnMode::Normal,
+            SpawnMode::Eager,
             ctx,
         )?;
 
@@ -1521,7 +1521,7 @@ impl TenantManager {
                 *child_shard,
                 child_location_conf,
                 None,
-                SpawnMode::Normal,
+                SpawnMode::Eager,
                 ctx,
             )
             .await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
         shard_identity,
         None,
         &TENANTS,
-        SpawnMode::Normal,
+        SpawnMode::Eager,
         ctx,
     )
     .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index ad3efb5837..b8e20c451f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
 
     def tenant_location_conf(
-        self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+        location_conf=dict[str, Any],
+        flush_ms=None,
+        lazy: Optional[bool] = None,
     ):
         body = location_conf.copy()
         body["tenant_id"] = str(tenant_id)
@@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         if flush_ms is not None:
             params["flush_ms"] = str(flush_ms)
 
+        if lazy is not None:
+            params["lazy"] = "true" if lazy else "false"
+
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
             json=body,
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 327e5abe26..cbf7059c92 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -2,6 +2,7 @@ import concurrent.futures
 import math
 import random
 import time
+from collections import defaultdict
 from contextlib import closing
 from pathlib import Path
 from typing import Optional
@@ -14,6 +15,7 @@ from fixtures.neon_fixtures import (
     Endpoint,
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PgBin,
     VanillaPostgres,
     wait_for_last_flush_lsn,
@@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
 
     # Deleting a stuck tenant should prompt it to go active
+    # in some cases, it has already been activated because it's behind the detach
+    delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False)
+    tenant_ids.remove(delete_tenant_id)
+
+    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
+    # we detached)
+    wait_until(10, 1, all_active)
+    assert len(get_tenant_states()) == n_tenants - 2
+
+
+def delete_lazy_activating(
+    delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool
+):
+    pageserver_http = pageserver.http_client()
+
+    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
+    # logical size is paused in a failpoint.  So instead we will use a log observation to check that
+    # on-demand activation was triggered by the tenant deletion
+    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
+
+    if expect_attaching:
+        assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"
+
     with concurrent.futures.ThreadPoolExecutor() as executor:
         log.info("Starting background delete")
 
+        def activated_on_demand():
+            assert pageserver.log_contains(log_match) is not None
+
         def delete_tenant():
-            env.pageserver.http_client().tenant_delete(delete_tenant_id)
+            pageserver_http.tenant_delete(delete_tenant_id)
 
         background_delete = executor.submit(delete_tenant)
 
-        # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
-        # logical size is paused in a failpoint.  So instead we will use a log observation to check that
-        # on-demand activation was triggered by the tenant deletion
-        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
-
-        def activated_on_demand():
-            assert env.pageserver.log_contains(log_match) is not None
-
         log.info(f"Waiting for activation message '{log_match}'")
         try:
             wait_until(10, 1, activated_on_demand)
@@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
         # Poll for deletion to complete
         wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
-        tenant_ids.remove(delete_tenant_id)
-
-    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
-    # we detached)
-    wait_until(10, 1, all_active)
-    assert len(get_tenant_states()) == n_tenants - 2
 
 
 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
@@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
     client.configure_failpoints(
         [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")]
     )
+
+
+def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+
+    env = neon_env_builder.init_start()
+
+    # the supporting_second does nothing except queue behind env.initial_tenant
+    # for purposes of showing that eager_tenant breezes past the queue
+    supporting_second, _ = env.neon_cli.create_tenant()
+    eager_tenant, _ = env.neon_cli.create_tenant()
+
+    client = env.pageserver.http_client()
+    client.tenant_location_conf(
+        eager_tenant,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.pageserver.stop()
+
+    # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
+    env.pageserver.start(
+        extra_env_vars={
+            "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
+        }
+    )
+
+    tenant_ids = [env.initial_tenant, supporting_second]
+
+    def get_tenant_states() -> dict[str, list[TenantId]]:
+        states = defaultdict(list)
+        for id in tenant_ids:
+            state = client.tenant_status(id)["state"]["slug"]
+            states[state].append(id)
+        return dict(states)
+
+    def one_is_active():
+        states = get_tenant_states()
+        log.info(f"{states}")
+        assert len(states["Active"]) == 1
+
+    wait_until(10, 1, one_is_active)
+
+    def other_is_attaching():
+        states = get_tenant_states()
+        assert len(states["Attaching"]) == 1
+
+    wait_until(10, 1, other_is_attaching)
+
+    def eager_tenant_is_active():
+        resp = client.tenant_status(eager_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id)
+    client.tenant_location_conf(
+        eager_tenant,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": gen,
+        },
+        lazy=False,
+    )
+    wait_until(10, 1, eager_tenant_is_active)
+
+    other_is_attaching()
+
+    client.configure_failpoints(
+        [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")]
+    )
+
+
+@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
+def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
+    # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+
+    env = neon_env_builder.init_start()
+
+    # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue
+    lazy_tenant, _ = env.neon_cli.create_tenant()
+
+    client = env.pageserver.http_client()
+    client.tenant_location_conf(
+        lazy_tenant,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.pageserver.stop()
+
+    # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
+    env.pageserver.start(
+        extra_env_vars={
+            "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
+        }
+    )
+
+    def initial_tenant_is_active():
+        resp = client.tenant_status(env.initial_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    wait_until(10, 1, initial_tenant_is_active)
+
+    # even though the initial tenant is now active, because it was startup time
+    # attach, it will consume the only permit because logical size calculation
+    # is paused.
+
+    gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id)
+    client.tenant_location_conf(
+        lazy_tenant,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": gen,
+        },
+        lazy=True,
+    )
+
+    def lazy_tenant_is_attaching():
+        resp = client.tenant_status(lazy_tenant)
+        assert resp["state"]["slug"] == "Attaching"
+
+    # paused logical size calculation of env.initial_tenant is keeping it attaching
+    wait_until(10, 1, lazy_tenant_is_attaching)
+
+    for _ in range(5):
+        lazy_tenant_is_attaching()
+        time.sleep(0.5)
+
+    def lazy_tenant_is_active():
+        resp = client.tenant_status(lazy_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    if activation_method == "endpoint":
+        with env.endpoints.create_start("main", tenant_id=lazy_tenant):
+            # starting up the endpoint should make it jump the queue
+            wait_until(10, 1, lazy_tenant_is_active)
+    elif activation_method == "branch":
+        env.neon_cli.create_timeline("second_branch", lazy_tenant)
+        wait_until(10, 1, lazy_tenant_is_active)
+    elif activation_method == "delete":
+        delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
+    else:
+        raise RuntimeError(activation_method)

From 3eb83a0ebbae56acad54190bc71085c7b424fb13 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 29 Feb 2024 15:54:58 +0200
Subject: [PATCH 03/18] Provide appoximation of working set using hyper-log-log
 algorithm in LFC (#6935)

## Summary of changes

Calculate number of unique page accesses at compute.
It can be used to estimate working set size and adjust cache size
(shared_buffers or local file cache).

Approximation is made using HyperLogLog algorithm.
It is performed by local file cache and so is available only when local
file cache is enabled.

This calculation doesn't take in account access to the pages present in
shared buffers, but includes pages available in local file cache.

This information can be retrieved using
approximate_working_set_size(reset bool) function from neon extension.
reset parameter can be used to reset statistic and so collect unique
accesses for the particular interval.

Below is an example of estimating working set size after pgbench -c 10
-S -T 100 -s 10:
```
postgres=# select approximate_working_set_size(false);
 approximate_working_set_size
------------------------------
                        19052
(1 row)

postgres=# select pg_table_size('pgbench_accounts')/8192;
 ?column?
----------
    16402
(1 row)
```


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/Makefile                         |  2 +-
 pgxn/neon/file_cache.c                     | 36 ++++++++++++++++++++++
 pgxn/neon/neon--1.2--1.3.sql               |  9 ++++++
 pgxn/neon/neon.control                     |  2 +-
 test_runner/regress/test_neon_extension.py |  2 +-
 5 files changed, 48 insertions(+), 3 deletions(-)
 create mode 100644 pgxn/neon/neon--1.2--1.3.sql

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index ef0a79a50c..7ea767ec74 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 11d6f6aec5..25275ef31f 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -25,6 +25,8 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pagestore_client.h"
+#include "common/hashfn.h"
+#include "lib/hyperloglog.h"
 #include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
@@ -60,6 +62,7 @@
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
 #define MB					((uint64)1024*1024)
 
+#define HYPER_LOG_LOG_BIT_WIDTH   10
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
 
 typedef struct FileCacheEntry
@@ -84,6 +87,8 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
+	hyperLogLogState wss_estimation; /* estimation of wroking set size */
+	uint8_t		hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
 } FileCacheControl;
 
 static HTAB *lfc_hash;
@@ -232,6 +237,14 @@ lfc_shmem_startup(void)
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
 
+		/* Initialize hyper-log-log structure for estimating working set size */
+		initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
+
+		/* We need hashes in shared memory */
+		pfree(lfc_ctl->wss_estimation.hashesArr);
+		memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
+		lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
+
 		/* Recreate file cache on restart */
 		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
@@ -529,6 +542,11 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 
 	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+	/* Approximate working set */
+	tag.blockNum = blkno;
+	addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+
 	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
 	{
 		/* Page is not cached */
@@ -967,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS)
 	else
 		SRF_RETURN_DONE(funcctx);
 }
+
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	int32 dc = -1;
+	if (lfc_size_limit != 0)
+	{
+		bool reset = PG_GETARG_BOOL(0);
+		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
+		dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
+		if (reset)
+			memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
+		LWLockRelease(lfc_lock);
+	}
+	PG_RETURN_INT32(dc);
+}
diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql
new file mode 100644
index 0000000000..9583008777
--- /dev/null
+++ b/pgxn/neon/neon--1.2--1.3.sql
@@ -0,0 +1,9 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
+
+CREATE FUNCTION approximate_working_set_size(reset bool)
+RETURNS integer
+AS 'MODULE_PATHNAME', 'approximate_working_set_size'
+LANGUAGE C PARALLEL SAFE;
+
+GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;
+
diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index 599b54b2ff..cee2f336f2 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.2'
+default_version = '1.3'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index 672f2b495d..1179a3afe9 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
             # IMPORTANT:
             # If the version has changed, the test should be updated.
             # Ensure that the default version is also updated in the neon.control file
-            assert cur.fetchone() == ("1.2",)
+            assert cur.fetchone() == ("1.3",)
             cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
             res = cur.fetchall()
             log.info(res)

From 5984edaecd9c1914fb88f17fcffaeeb7e1d3b1ca Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 29 Feb 2024 13:55:38 +0000
Subject: [PATCH 04/18] libs: fix expired token in auth decode test (#6963)

The test token expired earlier today (1709200879). I regenerated the
token, but without an expiration date this time.
---
 libs/utils/src/auth.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index 51ab238d77..fbf0dff665 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -206,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
         //   "scope": "tenant",
         //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
         //   "iss": "neon.controlplane",
-        //   "exp": 1709200879,
         //   "iat": 1678442479
         // }
         // ```
         //
-        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
 
         // Check it can be validated with the public key
         let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);

From 76ab57f33f88ea44de76a4da97cd877ae8acfcc7 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 29 Feb 2024 13:51:15 -0500
Subject: [PATCH 05/18] test: disable test_superuser on pg15 (#6972)

ref https://github.com/neondatabase/neon/issues/6969

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_neon_superuser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index ca8ada4ddb..e0364dd13f 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,9 +1,12 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.utils import wait_until
 
 
+@skip_on_postgres(
+    PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
+)
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
     env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

From 502b69b33bbd4ad1b0647e921a9c665249a2cd62 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 29 Feb 2024 20:50:23 +0100
Subject: [PATCH 06/18] refactor(compaction): `RequestContext` shouldn't be
 `Clone`, only `RequestContextAdaptor` uses it (#6961)

Extracted from https://github.com/neondatabase/neon/pull/6953

Part of https://github.com/neondatabase/neon/issues/5899
---
 pageserver/src/tenant/timeline/compaction.rs | 34 ++++++--------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 950459cbf9..914e3948ef 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -75,14 +75,13 @@ impl Timeline {
 
         let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
         let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
-        let ctx_adaptor = RequestContextAdaptor(ctx.clone());
 
         pageserver_compaction::compact_tiered::compact_tiered(
             &mut adaptor,
             end_lsn,
             target_file_size,
             fanout,
-            &ctx_adaptor,
+            ctx,
         )
         .await?;
 
@@ -143,13 +142,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
     type DeltaLayer = ResidentDeltaLayer;
     type ImageLayer = ResidentImageLayer;
 
-    type RequestContext = RequestContextAdaptor;
+    type RequestContext = crate::context::RequestContext;
 
     async fn get_layers(
         &mut self,
         key_range: &Range<Key>,
         lsn_range: &Range<Lsn>,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
         self.flush_updates().await?;
 
@@ -170,7 +169,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         &mut self,
         key_range: &Range<Key>,
         lsn: Lsn,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<Vec<Range<Key>>> {
         if lsn == self.keyspace.0 {
             Ok(pageserver_compaction::helpers::intersect_keyspace(
@@ -206,7 +205,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         &mut self,
         lsn: Lsn,
         key_range: &Range<Key>,
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         Ok(self.create_image_impl(lsn, key_range, ctx).await?)
     }
@@ -216,7 +215,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         lsn_range: &Range<Lsn>,
         key_range: &Range<Key>,
         input_layers: &[ResidentDeltaLayer],
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
 
@@ -287,7 +286,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
     async fn delete_layer(
         &mut self,
         layer: &OwnArc<PersistentLayerDesc>,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         self.layers_to_delete.push(layer.clone().0);
         Ok(())
@@ -299,7 +298,7 @@ impl TimelineAdaptor {
         &mut self,
         lsn: Lsn,
         key_range: &Range<Key>,
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> Result<(), PageReconstructError> {
         let timer = self.timeline.metrics.create_images_time_histo.start_timer();
 
@@ -361,17 +360,7 @@ impl TimelineAdaptor {
     }
 }
 
-pub struct RequestContextAdaptor(pub RequestContext);
-
-impl std::ops::Deref for RequestContextAdaptor {
-    type Target = RequestContext;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl CompactionRequestContext for RequestContextAdaptor {}
+impl CompactionRequestContext for crate::context::RequestContext {}
 
 #[derive(Debug, Clone)]
 pub struct OwnArc<T>(pub Arc<T>);
@@ -449,10 +438,7 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
 impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
     type DeltaEntry<'a> = DeltaEntry<'a>;
 
-    async fn load_keys<'a>(
-        &self,
-        ctx: &RequestContextAdaptor,
-    ) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
+    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
         self.0.load_keys(ctx).await
     }
 }

From ee93700a0fe5548c391ba8da5f10d5841c8911db Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 29 Feb 2024 22:54:16 +0200
Subject: [PATCH 07/18] dube: timeout individual layer evictions, log progress
 and record metrics (#6131)

Because of bugs evictions could hang and pause disk usage eviction task.
One such bug is known and fixed #6928. Guard each layer eviction with a
modest timeout deeming timeouted evictions as failures, to be
conservative.

In addition, add logging and metrics recording on each eviction
iteration:
- log collection completed with duration and amount of layers
    - per tenant collection time is observed in a new histogram
    - per tenant layer count is observed in a new histogram
- record metric for collected, selected and evicted layer counts
- log if eviction takes more than 10s
- log eviction completion with eviction duration

Additionally remove dead code for which no dead code warnings appeared
in earlier PR.

Follow-up to: #6060.
---
 pageserver/src/disk_usage_eviction_task.rs    | 145 ++++++++---
 pageserver/src/metrics.rs                     |  59 +++++
 pageserver/src/tenant/secondary.rs            |  89 ++++---
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  35 ++-
 .../src/tenant/storage_layer/layer/tests.rs   | 232 +++++++++++++++++-
 pageserver/src/tenant/timeline.rs             |  21 +-
 .../src/tenant/timeline/eviction_task.rs      |  13 +-
 8 files changed, 492 insertions(+), 104 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index b1c6f35704..92c1475aef 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId};
 
 use crate::{
     config::PageServerConf,
+    metrics::disk_usage_based_eviction::METRICS,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         self,
@@ -65,7 +66,6 @@ use crate::{
         remote_timeline_client::LayerFileMetadata,
         secondary::SecondaryTenant,
         storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
-        Timeline,
     },
 };
 
@@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         "running disk usage based eviction due to pressure"
     );
 
-    let candidates =
+    let (candidates, collection_time) = {
+        let started_at = std::time::Instant::now();
         match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
             EvictionCandidates::Cancelled => {
                 return Ok(IterationOutcome::Cancelled);
             }
-            EvictionCandidates::Finished(partitioned) => partitioned,
-        };
+            EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
+        }
+    };
+
+    METRICS.layers_collected.inc_by(candidates.len() as u64);
+
+    tracing::info!(
+        elapsed_ms = collection_time.as_millis(),
+        total_layers = candidates.len(),
+        "collection completed"
+    );
 
     // Debug-log the list of candidates
     let now = SystemTime::now();
@@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
 
-    let selection = select_victims(&candidates, usage_pre);
+    let (evicted_amount, usage_planned) =
+        select_victims(&candidates, usage_pre).into_amount_and_planned();
 
-    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
+    METRICS.layers_selected.inc_by(evicted_amount as u64);
 
     // phase2: evict layers
 
@@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             if let Some(next) = next {
                 match next {
                     Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                         usage_assumed.add_available_bytes(file_size);
                     }
-                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                    Ok(Err((
+                        file_size,
+                        EvictionError::NotFound
+                        | EvictionError::Downloaded
+                        | EvictionError::Timeout,
+                    ))) => {
                         evictions_failed.file_sizes += file_size;
                         evictions_failed.count += 1;
                     }
@@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 
             // calling again when consumed_all is fine as evicted is fused.
             let Some((_partition, candidate)) = evicted.next() else {
-                consumed_all = true;
+                if !consumed_all {
+                    tracing::info!("all evictions started, waiting");
+                    consumed_all = true;
+                }
                 continue;
             };
 
@@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                 EvictionLayer::Attached(layer) => {
                     let file_size = layer.layer_desc().file_size;
                     js.spawn(async move {
-                        layer
-                            .evict_and_wait()
-                            .await
-                            .map(|()| file_size)
-                            .map_err(|e| (file_size, e))
+                        // have a low eviction waiting timeout because our LRU calculations go stale fast;
+                        // also individual layer evictions could hang because of bugs and we do not want to
+                        // pause disk_usage_based_eviction for such.
+                        let timeout = std::time::Duration::from_secs(5);
+
+                        match layer.evict_and_wait(timeout).await {
+                            Ok(()) => Ok(file_size),
+                            Err(e) => Err((file_size, e)),
+                        }
                     });
                 }
                 EvictionLayer::Secondary(layer) => {
@@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         (usage_assumed, evictions_failed)
     };
 
+    let started_at = std::time::Instant::now();
+
+    let evict_layers = async move {
+        let mut evict_layers = std::pin::pin!(evict_layers);
+
+        let maximum_expected = std::time::Duration::from_secs(10);
+
+        let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
+        let tuple = if let Ok(tuple) = res {
+            tuple
+        } else {
+            let elapsed = started_at.elapsed();
+            tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
+            evict_layers.await
+        };
+
+        let elapsed = started_at.elapsed();
+        tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
+        tuple
+    };
+
+    let evict_layers =
+        evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
+
     let (usage_assumed, evictions_failed) = tokio::select! {
         tuple = evict_layers => { tuple },
         _ = cancel.cancelled() => {
@@ -763,6 +811,8 @@ async fn collect_eviction_candidates(
     eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
+
     // get a snapshot of the list of tenants
     let tenants = tenant::mgr::list_tenants()
         .await
@@ -791,6 +841,8 @@ async fn collect_eviction_candidates(
             continue;
         }
 
+        let started_at = std::time::Instant::now();
+
         // collect layers from all timelines in this tenant
         //
         // If one of the timelines becomes `!is_active()` during the iteration,
@@ -805,6 +857,7 @@ async fn collect_eviction_candidates(
             }
             let info = tl.get_local_layers_for_disk_usage_eviction().await;
             debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+
             tenant_candidates.extend(info.resident_layers.into_iter());
             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
 
@@ -870,7 +923,25 @@ async fn collect_eviction_candidates(
                     (partition, candidate)
                 });
 
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
+
         candidates.extend(tenant_candidates);
+
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
     }
 
     // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -885,11 +956,11 @@ async fn collect_eviction_candidates(
         },
     );
 
-    for secondary_tenant in secondary_tenants {
+    for tenant in secondary_tenants {
         // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
         // to prevent repeated disk usage based evictions from completely draining less often
         // updating secondaries.
-        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
+        let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
 
         debug_assert!(
             total_layers >= layer_info.resident_layers.len(),
@@ -897,6 +968,8 @@ async fn collect_eviction_candidates(
             layer_info.resident_layers.len()
         );
 
+        let started_at = std::time::Instant::now();
+
         layer_info
             .resident_layers
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -918,9 +991,27 @@ async fn collect_eviction_candidates(
                     )
                 });
 
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
         candidates.extend(tenant_candidates);
 
         tokio::task::yield_now().await;
+
+        let elapsed = started_at.elapsed();
+
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
     }
 
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -997,30 +1088,6 @@ impl<U: Usage> VictimSelection<U> {
     }
 }
 
-struct TimelineKey(Arc<Timeline>);
-
-impl PartialEq for TimelineKey {
-    fn eq(&self, other: &Self) -> bool {
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for TimelineKey {}
-
-impl std::hash::Hash for TimelineKey {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        Arc::as_ptr(&self.0).hash(state);
-    }
-}
-
-impl std::ops::Deref for TimelineKey {
-    type Target = Timeline;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.as_ref()
-    }
-}
-
 /// A totally ordered f32 subset we can use with sorting functions.
 pub(crate) mod finite_f32 {
 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1749e02c7f..1d894ed8a5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2474,6 +2474,64 @@ pub(crate) mod tenant_throttling {
     }
 }
 
+pub(crate) mod disk_usage_based_eviction {
+    use super::*;
+
+    pub(crate) struct Metrics {
+        pub(crate) tenant_collection_time: Histogram,
+        pub(crate) tenant_layer_count: Histogram,
+        pub(crate) layers_collected: IntCounter,
+        pub(crate) layers_selected: IntCounter,
+        pub(crate) layers_evicted: IntCounter,
+    }
+
+    impl Default for Metrics {
+        fn default() -> Self {
+            let tenant_collection_time = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
+                "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
+                vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
+            )
+            .unwrap();
+
+            let tenant_layer_count = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collected_layers",
+                "Amount of layers gathered from a tenant",
+                vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
+            )
+            .unwrap();
+
+            let layers_collected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_collected_layers_total",
+                "Amount of layers collected"
+            )
+            .unwrap();
+
+            let layers_selected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_select_layers_total",
+                "Amount of layers selected"
+            )
+            .unwrap();
+
+            let layers_evicted = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_evicted_layers_total",
+                "Amount of layers successfully evicted"
+            )
+            .unwrap();
+
+            Self {
+                tenant_collection_time,
+                tenant_layer_count,
+                layers_collected,
+                layers_selected,
+                layers_evicted,
+            }
+        }
+    }
+
+    pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
@@ -2508,6 +2566,7 @@ pub fn preinitialize_metrics() {
     Lazy::force(&TENANT_MANAGER);
 
     Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+    Lazy::force(&disk_usage_based_eviction::METRICS);
 
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index c466ac0c24..14e88b836e 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
 
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
-use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
+use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
 
 enum DownloadCommand {
     Download(TenantShardId),
@@ -121,6 +121,10 @@ impl SecondaryTenant {
         })
     }
 
+    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
+        self.tenant_shard_id
+    }
+
     pub(crate) async fn shutdown(&self) {
         self.cancel.cancel();
 
@@ -164,16 +168,17 @@ impl SecondaryTenant {
         self.detail.lock().unwrap().get_layers_for_eviction(self)
     }
 
+    /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
     pub(crate) async fn evict_layer(
-        &self,
+        self: &Arc<Self>,
         conf: &PageServerConf,
         timeline_id: TimelineId,
         name: LayerFileName,
     ) {
         debug_assert_current_span_has_tenant_id();
 
-        let _guard = match self.gate.enter() {
+        let guard = match self.gate.enter() {
             Ok(g) => g,
             Err(_) => {
                 tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -187,35 +192,57 @@ impl SecondaryTenant {
             .timeline_path(&self.tenant_shard_id, &timeline_id)
             .join(name.file_name());
 
-        // We tolerate ENOENT, because between planning eviction and executing
-        // it, the secondary downloader could have seen an updated heatmap that
-        // resulted in a layer being deleted.
-        // Other local I/O errors are process-fatal: these should never happen.
-        tokio::fs::remove_file(path)
-            .await
-            .or_else(fs_ext::ignore_not_found)
-            .fatal_err("Deleting layer during eviction");
+        let this = self.clone();
 
-        // Update the timeline's state.  This does not have to be synchronized with
-        // the download process, because:
-        // - If downloader is racing with us to remove a file (e.g. because it is
-        //   removed from heatmap), then our mutual .remove() operations will both
-        //   succeed.
-        // - If downloader is racing with us to download the object (this would require
-        //   multiple eviction iterations to race with multiple download iterations), then
-        //   if we remove it from the state, the worst that happens is the downloader
-        //   downloads it again before re-inserting, or we delete the file but it remains
-        //   in the state map (in which case it will be downloaded if this secondary
-        //   tenant transitions to attached and tries to access it)
-        //
-        // The important assumption here is that the secondary timeline state does not
-        // have to 100% match what is on disk, because it's a best-effort warming
-        // of the cache.
-        let mut detail = self.detail.lock().unwrap();
-        if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-            timeline_detail.on_disk_layers.remove(&name);
-            timeline_detail.evicted_at.insert(name, now);
-        }
+        // spawn it to be cancellation safe
+        tokio::task::spawn_blocking(move || {
+            let _guard = guard;
+            // We tolerate ENOENT, because between planning eviction and executing
+            // it, the secondary downloader could have seen an updated heatmap that
+            // resulted in a layer being deleted.
+            // Other local I/O errors are process-fatal: these should never happen.
+            let deleted = std::fs::remove_file(path);
+
+            let not_found = deleted
+                .as_ref()
+                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
+
+            let deleted = if not_found {
+                false
+            } else {
+                deleted
+                    .map(|()| true)
+                    .fatal_err("Deleting layer during eviction")
+            };
+
+            if !deleted {
+                // skip updating accounting and putting perhaps later timestamp
+                return;
+            }
+
+            // Update the timeline's state.  This does not have to be synchronized with
+            // the download process, because:
+            // - If downloader is racing with us to remove a file (e.g. because it is
+            //   removed from heatmap), then our mutual .remove() operations will both
+            //   succeed.
+            // - If downloader is racing with us to download the object (this would require
+            //   multiple eviction iterations to race with multiple download iterations), then
+            //   if we remove it from the state, the worst that happens is the downloader
+            //   downloads it again before re-inserting, or we delete the file but it remains
+            //   in the state map (in which case it will be downloaded if this secondary
+            //   tenant transitions to attached and tries to access it)
+            //
+            // The important assumption here is that the secondary timeline state does not
+            // have to 100% match what is on disk, because it's a best-effort warming
+            // of the cache.
+            let mut detail = this.detail.lock().unwrap();
+            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+                timeline_detail.on_disk_layers.remove(&name);
+                timeline_detail.evicted_at.insert(name, now);
+            }
+        })
+        .await
+        .expect("secondary eviction should not have panicked");
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9de820912e..299950cc21 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -72,7 +72,7 @@ where
 /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
 /// call, to collect more records.
 ///
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct ValueReconstructState {
     pub records: Vec<(Lsn, NeonWalRecord)>,
     pub img: Option<(Lsn, Bytes)>,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 13c9e5c989..247dd1a8e4 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
-use std::time::SystemTime;
+use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::lsn::Lsn;
 use utils::sync::heavier_once_cell;
@@ -208,10 +208,15 @@ impl Layer {
     /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     ///
+    /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
+    /// will happen regardless the future returned by this method completing unless there is a
+    /// read access (currently including [`Layer::keep_resident`]) before eviction gets to
+    /// complete.
+    ///
     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     /// of download-evict cycle on retry.
-    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
-        self.0.evict_and_wait().await
+    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
+        self.0.evict_and_wait(timeout).await
     }
 
     /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -363,7 +368,7 @@ impl Layer {
     ///
     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     /// separatedly.
-    #[cfg(feature = "testing")]
+    #[cfg(any(feature = "testing", test))]
     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
         let mut rx = self.0.status.subscribe();
 
@@ -632,7 +637,7 @@ impl LayerInner {
 
     /// Cancellation safe, however dropping the future and calling this method again might result
     /// in a new attempt to evict OR join the previously started attempt.
-    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
+    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
         use tokio::sync::broadcast::error::RecvError;
 
         assert!(self.have_remote_client);
@@ -652,16 +657,22 @@ impl LayerInner {
         if strong.is_some() {
             // drop the DownloadedLayer outside of the holding the guard
             drop(strong);
+
+            // idea here is that only one evicter should ever get to witness a strong reference,
+            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
+            // cancelled eviction and signal us, like it currently does.
+            //
+            // a second concurrent evict_and_wait will not see a strong reference.
             LAYER_IMPL_METRICS.inc_started_evictions();
         }
 
-        match rx.recv().await {
-            Ok(Status::Evicted) => Ok(()),
-            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
-            Err(RecvError::Closed) => {
+        match tokio::time::timeout(timeout, rx.recv()).await {
+            Ok(Ok(Status::Evicted)) => Ok(()),
+            Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
+            Ok(Err(RecvError::Closed)) => {
                 unreachable!("sender cannot be dropped while we are in &self method")
             }
-            Err(RecvError::Lagged(_)) => {
+            Ok(Err(RecvError::Lagged(_))) => {
                 // this is quite unlikely, but we are blocking a lot in the async context, so
                 // we might be missing this because we are stuck on a LIFO slot on a thread
                 // which is busy blocking for a 1TB database create_image_layers.
@@ -674,6 +685,7 @@ impl LayerInner {
                     None => Ok(()),
                 }
             }
+            Err(_timeout) => Err(EvictionError::Timeout),
         }
     }
 
@@ -1195,6 +1207,9 @@ pub(crate) enum EvictionError {
     /// Evictions must always lose to downloads in races, and this time it happened.
     #[error("layer was downloaded instead")]
     Downloaded,
+
+    #[error("eviction did not happen within timeout")]
+    Timeout,
 }
 
 /// Error internal to the [`LayerInner::get_or_maybe_download`]
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 01c62b6f83..b43534efd4 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,13 +1,173 @@
 use futures::StreamExt;
+use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
+use tracing::Instrument;
 use utils::{
     completion::{self, Completion},
     id::TimelineId,
 };
 
 use super::*;
-use crate::task_mgr::BACKGROUND_RUNTIME;
-use crate::tenant::harness::TenantHarness;
+use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
+use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
+
+/// Used in tests to advance a future to wanted await point, and not futher.
+const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
+
+/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE
+/// timeout uses to advance futures.
+const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
+
+/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
+#[tokio::test]
+async fn smoke_test() {
+    let handle = BACKGROUND_RUNTIME.handle();
+
+    let h = TenantHarness::create("smoke_test").unwrap();
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+    let (tenant, _) = h.load().await;
+
+    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // all layers created at pageserver are like `layer`, initialized with strong
+    // Arc<DownloadedLayer>.
+
+    let img_before = {
+        let mut data = ValueReconstructState::default();
+        layer
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .await
+            .unwrap();
+        data.img
+            .take()
+            .expect("tenant harness writes the control file")
+    };
+
+    // important part is evicting the layer, which can be done when there are no more ResidentLayer
+    // instances -- there currently are none, only two `Layer` values, one in the layermap and on
+    // in scope.
+    layer.evict_and_wait(FOREVER).await.unwrap();
+
+    // double-evict returns an error, which is valid if both eviction_task and disk usage based
+    // eviction would both evict the same layer at the same time.
+
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound));
+
+    // on accesses when the layer is evicted, it will automatically be downloaded.
+    let img_after = {
+        let mut data = ValueReconstructState::default();
+        layer
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .instrument(download_span.clone())
+            .await
+            .unwrap();
+        data.img.take().unwrap()
+    };
+
+    assert_eq!(img_before, img_after);
+
+    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
+    //
+    // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
+    // artificially slow it down.
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+    match layer
+        .evict_and_wait(std::time::Duration::ZERO)
+        .await
+        .unwrap_err()
+    {
+        EvictionError::Timeout => {
+            // expected, but note that the eviction is "still ongoing"
+            helper.release().await;
+            // exhaust spawn_blocking pool to ensure it is now complete
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
+                .await;
+        }
+        other => unreachable!("{other:?}"),
+    }
+
+    // only way to query if a layer is resident is to acquire a ResidentLayer instance.
+    // Layer::keep_resident never downloads, but it might initialize if the layer file is found
+    // downloaded locally.
+    let none = layer.keep_resident().await.unwrap();
+    assert!(
+        none.is_none(),
+        "Expected none, because eviction removed the local file, found: {none:?}"
+    );
+
+    // plain downloading is rarely needed
+    layer
+        .download_and_keep_resident()
+        .instrument(download_span)
+        .await
+        .unwrap();
+
+    // last important part is deletion on drop: gc and compaction use it for compacted L0 layers
+    // or fully garbage collected layers. deletion means deleting the local file, and scheduling a
+    // deletion of the already unlinked from index_part.json remote file.
+    //
+    // marking a layer to be deleted on drop is irreversible; there is no technical reason against
+    // reversiblity, but currently it is not needed so it is not provided.
+    layer.delete_on_drop();
+
+    let path = layer.local_path().to_owned();
+
+    // wait_drop produces an unconnected to Layer future which will resolve when the
+    // LayerInner::drop has completed.
+    let mut wait_drop = std::pin::pin!(layer.wait_drop());
+
+    // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
+    // until here
+    tokio::time::pause();
+    tokio::time::timeout(ADVANCE, &mut wait_drop)
+        .await
+        .expect_err("should had timed out because two strong references exist");
+
+    tokio::fs::metadata(&path)
+        .await
+        .expect("the local layer file still exists");
+
+    let rtc = timeline.remote_client.as_ref().unwrap();
+
+    {
+        let layers = &[layer];
+        let mut g = timeline.layers.write().await;
+        g.finish_gc_timeline(layers);
+        // this just updates the remote_physical_size for demonstration purposes
+        rtc.schedule_gc_update(layers).unwrap();
+    }
+
+    // when strong references are dropped, the file is deleted and remote deletion is scheduled
+    wait_drop.await;
+
+    let e = tokio::fs::metadata(&path)
+        .await
+        .expect_err("the local file is deleted");
+    assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
+
+    rtc.wait_completion().await.unwrap();
+
+    assert_eq!(rtc.get_remote_physical_size(), 0);
+}
 
 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
 /// time. Now both of them complete per Arc drop semantics.
@@ -41,10 +201,10 @@ async fn evict_and_wait_on_wanted_deleted() {
     let resident = layer.keep_resident().await.unwrap();
 
     {
-        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
         // drive the future to await on the status channel
-        tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
             .await
             .expect_err("should had been a timeout since we are holding the layer resident");
 
@@ -115,10 +275,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     let resident = layer.keep_resident().await.unwrap();
 
-    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
     // drive the future to await on the status channel
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+    tokio::time::timeout(ADVANCE, &mut evict_and_wait)
         .await
         .expect_err("should had been a timeout since we are holding the layer resident");
     assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
@@ -138,7 +298,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     // because the keep_resident check alters wanted evicted without sending a message, we will
     // never get completed
-    let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+    let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
         .await
         .expect("no timeout, because keep_resident re-initialized")
         .expect_err("eviction should not have succeeded because re-initialized");
@@ -158,9 +318,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
             .sum::<u64>()
     );
 
-    let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
+    let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+    // advance to the wait on the queue
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
         .await
         .expect_err("timeout because spawn_blocking is clogged");
 
@@ -171,7 +332,12 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     helper.release().await;
 
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+    // the second_eviction gets to run here
+    //
+    // synchronize to be *strictly* after the second_eviction spawn_blocking run
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
         .await
         .expect("eviction goes through now that spawn_blocking is unclogged")
         .expect("eviction should succeed, because version matches");
@@ -261,3 +427,49 @@ impl SpawnBlockingPoolHelper {
             .await
     }
 }
+
+#[test]
+fn spawn_blocking_pool_helper_actually_works() {
+    // create a custom runtime for which we know and control how many blocking threads it has
+    //
+    // because the amount is not configurable for our helper, expect the same amount as
+    // BACKGROUND_RUNTIME using the tokio defaults would have.
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(512)
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let handle = rt.handle();
+
+    rt.block_on(async move {
+        // this will not return until all threads are spun up and actually executing the code
+        // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
+        let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+        println!("consumed");
+
+        let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
+            // this will not get to run before we release
+        }));
+
+        println!("spawned");
+
+        tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
+            .await
+            .expect_err("the task should not have gotten to run yet");
+
+        println!("tried to join");
+
+        consumed.release().await;
+
+        println!("released");
+
+        tokio::time::timeout(std::time::Duration::from_secs(1), jh)
+            .await
+            .expect("no timeout")
+            .expect("no join error");
+
+        println!("joined");
+    });
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fa5e7b3685..206f20306e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1512,10 +1512,14 @@ impl Timeline {
             return Ok(None);
         };
 
-        match local_layer.evict_and_wait().await {
+        // curl has this by default
+        let timeout = std::time::Duration::from_secs(120);
+
+        match local_layer.evict_and_wait(timeout).await {
             Ok(()) => Ok(Some(true)),
             Err(EvictionError::NotFound) => Ok(Some(false)),
             Err(EvictionError::Downloaded) => Ok(Some(false)),
+            Err(EvictionError::Timeout) => Ok(Some(false)),
         }
     }
 }
@@ -5157,8 +5161,7 @@ mod tests {
         let harness =
             TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
 
-        let ctx = any_context();
-        let tenant = harness.do_try_load(&ctx).await.unwrap();
+        let (tenant, ctx) = harness.load().await;
         let timeline = tenant
             .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
             .await
@@ -5172,8 +5175,10 @@ mod tests {
             .expect("should had been resident")
             .drop_eviction_guard();
 
-        let first = async { layer.evict_and_wait().await };
-        let second = async { layer.evict_and_wait().await };
+        let forever = std::time::Duration::from_secs(120);
+
+        let first = layer.evict_and_wait(forever);
+        let second = layer.evict_and_wait(forever);
 
         let (first, second) = tokio::join!(first, second);
 
@@ -5192,12 +5197,6 @@ mod tests {
         }
     }
 
-    fn any_context() -> crate::context::RequestContext {
-        use crate::context::*;
-        use crate::task_mgr::*;
-        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
-    }
-
     async fn find_some_layer(timeline: &Timeline) -> Layer {
         let layers = timeline.layers.read().await;
         let desc = layers
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 008f9482c4..dd603135d2 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -204,6 +204,7 @@ impl Timeline {
             evicted: usize,
             errors: usize,
             not_evictable: usize,
+            timeouts: usize,
             #[allow(dead_code)]
             skipped_for_shutdown: usize,
         }
@@ -267,7 +268,11 @@ impl Timeline {
                 let layer = guard.drop_eviction_guard();
                 if no_activity_for > p.threshold {
                     // this could cause a lot of allocations in some cases
-                    js.spawn(async move { layer.evict_and_wait().await });
+                    js.spawn(async move {
+                        layer
+                            .evict_and_wait(std::time::Duration::from_secs(5))
+                            .await
+                    });
                     stats.candidates += 1;
                 }
             }
@@ -280,6 +285,9 @@ impl Timeline {
                     Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                         stats.not_evictable += 1;
                     }
+                    Ok(Err(EvictionError::Timeout)) => {
+                        stats.timeouts += 1;
+                    }
                     Err(je) if je.is_cancelled() => unreachable!("not used"),
                     Err(je) if je.is_panic() => {
                         /* already logged */
@@ -295,7 +303,8 @@ impl Timeline {
             stats = join_all => {
                 if stats.candidates == stats.not_evictable {
                     debug!(stats=?stats, "eviction iteration complete");
-                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
+                    // reminder: timeouts are not eviction cancellations
                     warn!(stats=?stats, "eviction iteration complete");
                 } else {
                     info!(stats=?stats, "eviction iteration complete");

From e9e77ee744298f4a79ec24734ffd5d76ddb83d02 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 1 Mar 2024 10:45:39 +0100
Subject: [PATCH 08/18] tests: add optional cursor to `log_contains` + fix
 truthiness issues in callers (#6960)

Extracted from https://github.com/neondatabase/neon/pull/6953

Part of https://github.com/neondatabase/neon/issues/5899

Core Change
-----------

In #6953, we need the ability to scan the log _after_ a specific line
and ignore anything before that line.

This PR changes `log_contains` to returns a tuple of `(matching line,
cursor)`.
Hand that cursor to a subsequent `log_contains` call to search the log
for the next occurrence of the pattern.

Other Changes
-------------

- Inspect all the callsites of `log_contains` to handle the new tuple
return type.
- Above inspection unveiled many callers aren't using `assert
log_contains(...) is not None` but some weaker version of the code that
breaks if `log_contains` ever returns a not-None but falsy value. Fix
that.
- Above changes unveiled that `test_remote_storage_upload_queue_retries`
was using `wait_until` incorrectly; after fixing the usage, I had to
raise the `wait_until` timeout. So, maybe this will fix its flakiness.
---
 test_runner/fixtures/neon_fixtures.py         | 27 ++++++++--
 test_runner/fixtures/pageserver/utils.py      |  6 +--
 test_runner/fixtures/utils.py                 | 19 ++++++-
 .../regress/test_attach_tenant_config.py      |  9 ++--
 .../regress/test_disk_usage_eviction.py       | 20 ++++---
 test_runner/regress/test_duplicate_layers.py  |  2 +-
 .../regress/test_layers_from_future.py        | 11 ++--
 test_runner/regress/test_logging.py           |  2 +-
 .../regress/test_pageserver_generations.py    |  2 +-
 test_runner/regress/test_remote_storage.py    | 52 ++++++++++---------
 test_runner/regress/test_sharding_service.py  |  4 +-
 test_runner/regress/test_tenant_delete.py     | 12 ++---
 test_runner/regress/test_tenant_detach.py     |  4 +-
 test_runner/regress/test_tenant_relocation.py |  4 +-
 .../test_tenants_with_remote_storage.py       |  4 +-
 .../regress/test_threshold_based_eviction.py  |  4 +-
 test_runner/regress/test_timeline_delete.py   | 11 ++--
 17 files changed, 119 insertions(+), 74 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 71e77334a1..b933d391ab 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2180,6 +2180,11 @@ class NeonAttachmentService(MetricsGetter):
         self.stop(immediate=True)
 
 
+@dataclass
+class LogCursor:
+    _line_no: int
+
+
 class NeonPageserver(PgProtocol):
     """
     An object representing a running pageserver.
@@ -2343,7 +2348,18 @@ class NeonPageserver(PgProtocol):
             value = self.http_client().get_metric_value(metric)
             assert value == 0, f"Nonzero {metric} == {value}"
 
-    def log_contains(self, pattern: str) -> Optional[str]:
+    def assert_log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Tuple[str, LogCursor]:
+        """Convenient for use inside wait_until()"""
+
+        res = self.log_contains(pattern, offset=offset)
+        assert res is not None
+        return res
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
         """Check that the pageserver log contains a line that matches the given regex"""
         logfile = self.workdir / "pageserver.log"
         if not logfile.exists():
@@ -2357,12 +2373,17 @@ class NeonPageserver(PgProtocol):
         # no guarantee it is already present in the log file. This hasn't
         # been a problem in practice, our python tests are not fast enough
         # to hit that race condition.
+        skip_until_line_no = 0 if offset is None else offset._line_no
+        cur_line_no = 0
         with logfile.open("r") as f:
             for line in f:
+                if cur_line_no < skip_until_line_no:
+                    cur_line_no += 1
+                    continue
                 if contains_re.search(line):
                     # found it!
-                    return line
-
+                    cur_line_no += 1
+                    return (line, LogCursor(cur_line_no))
         return None
 
     def tenant_attach(
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 1415038f69..c600733e41 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -20,7 +20,7 @@ def assert_tenant_state(
     tenant: TenantId,
     expected_state: str,
     message: Optional[str] = None,
-):
+) -> None:
     tenant_status = pageserver_http.tenant_status(tenant)
     log.info(f"tenant_status: {tenant_status}")
     assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
     iterations: int = 20,
     interval: Optional[float] = None,
     **delete_args,
-):
+) -> None:
     pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
     wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
 
@@ -302,7 +302,7 @@ def assert_prefix_empty(
     remote_storage: Optional[RemoteStorage],
     prefix: Optional[str] = None,
     allowed_postfix: Optional[str] = None,
-):
+) -> None:
     assert remote_storage is not None
     response = list_prefix(remote_storage, prefix)
     keys = response["KeyCount"]
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 91f33e1196..7fc3bae3af 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -369,7 +369,12 @@ def start_in_background(
         return spawned_process
 
 
-def wait_until(number_of_iterations: int, interval: float, func: Fn):
+WaitUntilRet = TypeVar("WaitUntilRet")
+
+
+def wait_until(
+    number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
+) -> WaitUntilRet:
     """
     Wait until 'func' returns successfully, without exception. Returns the
     last return value from the function.
@@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
     raise Exception("timed out while waiting for %s" % func) from last_exception
 
 
+def assert_eq(a, b) -> None:
+    assert a == b
+
+
+def assert_gt(a, b) -> None:
+    assert a > b
+
+
+def assert_ge(a, b) -> None:
+    assert a >= b
+
+
 def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
     """
     Fast way to populate data.
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 6cae663842..7fbce6a10c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
         ]
     )
 
-    def log_contains_bad_request():
-        env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
-
-    wait_until(50, 0.1, log_contains_bad_request)
+    wait_until(
+        50,
+        0.1,
+        lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
+    )
 
 
 def test_null_body(negative_env: NegativeTests):
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index eb4e370ea7..b83545216d 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -200,7 +200,7 @@ class EvictionEnv:
                 tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
 
         def statvfs_called():
-            assert pageserver.log_contains(".*running mocked statvfs.*")
+            pageserver.assert_log_contains(".*running mocked statvfs.*")
 
         # we most likely have already completed multiple runs
         wait_until(10, 1, statvfs_called)
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
     assert actual_change >= target, "eviction must always evict more than target"
 
     time.sleep(1)  # give log time to flush
-    assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
+    env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
     env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
 
 
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
+    env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
     env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
 
 
@@ -801,10 +801,9 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    def relieved_log_message():
-        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
-
-    wait_until(10, 1, relieved_log_message)
+    wait_until(
+        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+    )
 
     def less_than_max_usage_pct():
         post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
@@ -845,10 +844,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    def relieved_log_message():
-        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
-
-    wait_until(10, 1, relieved_log_message)
+    wait_until(
+        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+    )
 
     def more_than_min_avail_bytes_freed():
         post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index 224e6f50c7..cb4fa43be7 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
 
     time.sleep(10)  # let compaction to be performed
-    assert env.pageserver.log_contains("compact-level0-phase1-return-same")
+    env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
 
 
 def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 999e077e45..9da47b9fd3 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -184,10 +184,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
 
     # NB: the layer file is unlinked index part now, but, because we made the delete
     # operation stuck, the layer file itself is still in the remote_storage
-    def delete_at_pause_point():
-        assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
-
-    wait_until(10, 0.5, delete_at_pause_point)
+    wait_until(
+        10,
+        0.5,
+        lambda: env.pageserver.assert_log_contains(
+            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
+        ),
+    )
     future_layer_path = env.pageserver_remote_storage.remote_layer_path(
         tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
     )
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
index d62b5e531c..bfffad7572 100644
--- a/test_runner/regress/test_logging.py
+++ b/test_runner/regress/test_logging.py
@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
     def assert_logged():
         if not log_expected:
             return
-        assert env.pageserver.log_contains(f".*{msg_id}.*")
+        env.pageserver.assert_log_contains(f".*{msg_id}.*")
 
     wait_until(10, 0.5, assert_logged)
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 1070d06ed0..89fc48a49f 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(
 
     main_pageserver.start()
 
-    def assert_deletions_submitted(n: int):
+    def assert_deletions_submitted(n: int) -> None:
         assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
 
     # After restart, issue a flush to kick the deletion frontend to do recovery.
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 73ebe0a76f..f8a0bef954 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -28,7 +28,14 @@ from fixtures.remote_storage import (
     available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import print_gc_result, query_scalar, wait_until
+from fixtures.utils import (
+    assert_eq,
+    assert_ge,
+    assert_gt,
+    print_gc_result,
+    query_scalar,
+    wait_until,
+)
 from requests import ReadTimeout
 
 
@@ -120,10 +127,10 @@ def test_remote_storage_backup_and_restore(
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
@@ -292,9 +299,9 @@ def test_remote_storage_upload_queue_retries(
     print_gc_result(gc_result)
     assert gc_result["layers_removed"] > 0
 
-    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-    wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # let all future operations queue up
     configure_storage_sync_failpoints("return")
@@ -322,17 +329,17 @@ def test_remote_storage_upload_queue_retries(
     churn_while_failpoints_active_thread.start()
 
     # wait for churn thread's data to get stuck in the upload queue
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
+    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
+    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
 
     # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
@@ -884,26 +891,23 @@ def wait_upload_queue_empty(
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
+        ),
     )
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
+        ),
     )
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
+        ),
     )
 
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 6ed49d7fd6..c8224c1c67 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
     # Marking a pageserver offline should migrate tenants away from it.
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
-    def node_evacuated(node_id: int):
+    def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, tenant_ids)
         assert counts[node_id] == 0
 
@@ -405,7 +405,7 @@ def test_sharding_service_compute_hook(
 
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
-    def node_evacuated(node_id: int):
+    def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, [env.initial_tenant])
         assert counts[node_id] == 0
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 8c7d332e1d..c4b4e5fb77 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
         return ps_http.tenant_delete(tenant_id)
 
     def hit_remove_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
 
     def hit_run_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
     Thread(target=timeline_create).start()
 
     def hit_initdb_upload_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
 
     wait_until(100, 0.1, hit_initdb_upload_failpoint)
 
     def creation_connection_timed_out():
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             "POST.*/timeline.* request was dropped before completing"
         )
 
@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
     Thread(target=tenant_delete).start()
 
     def deletion_arrived():
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
         )
 
@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
     )
 
     # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
-    assert env.pageserver.log_contains(CANCELLED_ERROR)
+    env.pageserver.assert_log_contains(CANCELLED_ERROR)
     assert not env.pageserver.log_contains(
         ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
     )
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 4752699abb..d3f24cb06e 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index b70131472a..9def3ad1c2 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -495,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay(
         assert cur.fetchall() == [("before pause",), ("after pause",)]
 
     # Sanity check that the failpoint was reached
-    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
     assert time.time() - before_attach_time > 5
 
     # Clean up
@@ -632,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb(
         assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
 
     # Sanity check that the failpoint was reached
-    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
     assert time.time() - before_attach_time > 5
 
     # Clean up
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 1c693a0df5..d16978d02a 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -147,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 5f72cfd747..7bf49a0874 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -179,6 +179,6 @@ def test_threshold_based_eviction(
     assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized"
     assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident"
 
-    assert env.pageserver.log_contains(
-        metrics_refused_log_line
+    assert (
+        env.pageserver.log_contains(metrics_refused_log_line) is not None
     ), "ensure the metrics collection worker ran"
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index a6a6fb47cc..795110d90b 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     assert timeline_path.exists()
 
     # retry deletes when compaction or gc is running in pageserver
+    # TODO: review whether this wait_until is actually necessary, we do an await() internally
     wait_until(
         number_of_iterations=3,
         interval=0.2,
@@ -531,7 +532,7 @@ def test_concurrent_timeline_delete_stuck_on(
     try:
 
         def first_call_hit_failpoint():
-            assert env.pageserver.log_contains(
+            env.pageserver.assert_log_contains(
                 f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
             )
 
@@ -602,7 +603,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
 
     def hit_failpoint():
-        assert env.pageserver.log_contains(at_failpoint_log_message)
+        env.pageserver.assert_log_contains(at_failpoint_log_message)
 
     wait_until(50, 0.1, hit_failpoint)
 
@@ -612,7 +613,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     env.pageserver.allowed_errors.append(hangup_log_message)
 
     def got_hangup_log_message():
-        assert env.pageserver.log_contains(hangup_log_message)
+        env.pageserver.assert_log_contains(hangup_log_message)
 
     wait_until(50, 0.1, got_hangup_log_message)
 
@@ -624,7 +625,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
 
     def first_request_finished():
         message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
-        assert env.pageserver.log_contains(message)
+        env.pageserver.assert_log_contains(message)
 
     wait_until(50, 0.1, first_request_finished)
 
@@ -759,7 +760,7 @@ def test_delete_orphaned_objects(
 
     for orphan in orphans:
         assert not orphan.exists()
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             f"deleting a file not referenced from index_part.json name={orphan.stem}"
         )
 

From 7ba50708e3450b501806568d5f37cd5e20d609fd Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Fri, 1 Mar 2024 13:29:08 +0100
Subject: [PATCH 09/18] Testcase for neon extension function
 approximate_working_set_size() (#6980)

## Problem

PR https://github.com/neondatabase/neon/pull/6935 introduced a new
function in neon extension:

approximate_working_set_size

This test case verifies its working correctly.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .../test_lfc_working_set_approximation.py     | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 test_runner/regress/test_lfc_working_set_approximation.py

diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
new file mode 100644
index 0000000000..a6f05fe0f7
--- /dev/null
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import query_scalar
+
+
+def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    cache_dir = Path(env.repo_dir) / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+
+    branchname = "test_approximate_working_set_size"
+    env.neon_cli.create_branch(branchname, "empty")
+    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    endpoint = env.endpoints.create_start(
+        branchname,
+        config_lines=[
+            "shared_buffers='1MB'",
+            f"neon.file_cache_path='{cache_dir}/file.cache'",
+            "neon.max_file_cache_size='128MB'",
+            "neon.file_cache_size_limit='64MB'",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+    cur.execute("create extension neon")
+
+    log.info(f"preparing some data in {endpoint.connstr()}")
+
+    ddl = """
+CREATE TABLE pgbench_accounts (
+    aid bigint NOT NULL,
+    bid integer,
+    abalance integer,
+    filler character(84),
+    -- more web-app like columns
+    text_column_plain TEXT  DEFAULT repeat('NeonIsCool', 5),
+    jsonb_column_extended JSONB  DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
+)
+WITH (fillfactor='100');
+"""
+
+    cur.execute(ddl)
+    # prepare index access below
+    cur.execute(
+        "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
+    )
+    cur.execute(
+        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
+    )
+    # ensure correct query plans and stats
+    cur.execute("vacuum ANALYZE pgbench_accounts")
+    # determine table size - working set should approximate table size after sequential scan
+    pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
+    log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
+    cur.execute("select approximate_working_set_size(true)")
+    cur.execute(
+        'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
+    )
+    # verify working set size after sequential scan matches table size and reset working set for next test
+    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
+    log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
+    assert pages * 0.8 < blocks < pages * 1.2
+    # run a few point queries with index lookup
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid =   4242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid =  54242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
+    # verify working set size after some index access of a few select pages only
+    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
+    log.info(f"working set size after some index access of a few select pages only {blocks}")
+    assert blocks < 10

From f8bdce101542ace882cf891f001f53c702a9685b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 13:26:45 +0000
Subject: [PATCH 10/18] pageserver: fix duplicate shard_id in span (#6981)

## Problem

shard_id in span is repeated:
- https://github.com/neondatabase/neon/issues/6723

Closes: #6723

## Summary of changes

- Only add shard_id to the span when fetching a cached timeline, as it
is already added when loading an uncached timeline.
---
 pageserver/src/page_service.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 11eb512750..cd9c48f9af 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1115,7 +1115,10 @@ impl PageServerHandler {
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
         let timeline = match self.get_cached_timeline_for_page(req) {
-            Ok(tl) => tl,
+            Ok(tl) => {
+                set_tracing_field_shard_id(tl);
+                tl
+            }
             Err(key) => {
                 match self
                     .load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1140,9 +1143,6 @@ impl PageServerHandler {
             }
         };
 
-        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
-        set_tracing_field_shard_id(timeline);
-
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetPageAtLsn);

From 5ab10d051d28b930b81ef3b712a5f13de695285a Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 1 Mar 2024 16:04:39 +0200
Subject: [PATCH 11/18] metrics: record more details of the responding (#6979)

On eu-west-1 during benchmarks we sometimes lose samples. Add more time
measurements.
---
 libs/utils/src/http/endpoint.rs | 48 +++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 550ab10700..3c71628870 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
     buffer: BytesMut,
     pub tx: mpsc::Sender<std::io::Result<Bytes>>,
     written: usize,
+    /// Time spent waiting for the channel to make progress. It is not the same as time to upload a
+    /// buffer because we cannot know anything about that, but this should allow us to understand
+    /// the actual time taken without the time spent `std::thread::park`ed.
+    wait_time: std::time::Duration,
 }
 
 impl ChannelWriter {
@@ -168,6 +172,7 @@ impl ChannelWriter {
             buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
             tx,
             written: 0,
+            wait_time: std::time::Duration::ZERO,
         }
     }
 
@@ -180,6 +185,8 @@ impl ChannelWriter {
         tracing::trace!(n, "flushing");
         let ready = self.buffer.split().freeze();
 
+        let wait_started_at = std::time::Instant::now();
+
         // not ideal to call from blocking code to block_on, but we are sure that this
         // operation does not spawn_blocking other tasks
         let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -192,6 +199,9 @@ impl ChannelWriter {
             // sending it to the client.
             Ok(())
         });
+
+        self.wait_time += wait_started_at.elapsed();
+
         if res.is_err() {
             return Err(std::io::ErrorKind::BrokenPipe.into());
         }
@@ -202,6 +212,10 @@ impl ChannelWriter {
     pub fn flushed_bytes(&self) -> usize {
         self.written
     }
+
+    pub fn wait_time(&self) -> std::time::Duration {
+        self.wait_time
+    }
 }
 
 impl std::io::Write for ChannelWriter {
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
 
     let span = info_span!("blocking");
     tokio::task::spawn_blocking(move || {
+        // there are situations where we lose scraped metrics under load, try to gather some clues
+        // since all nodes are queried this, keep the message count low.
+        let spawned_at = std::time::Instant::now();
+
         let _span = span.entered();
+
         let metrics = metrics::gather();
+
+        let gathered_at = std::time::Instant::now();
+
         let res = encoder
             .encode(&metrics, &mut writer)
             .and_then(|_| writer.flush().map_err(|e| e.into()));
 
+        // this instant is not when we finally got the full response sent, sending is done by hyper
+        // in another task.
+        let encoded_at = std::time::Instant::now();
+
+        let spawned_in = spawned_at - started_at;
+        let collected_in = gathered_at - spawned_at;
+        // remove the wait time here in case the tcp connection was clogged
+        let encoded_in = encoded_at - gathered_at - writer.wait_time();
+        let total = encoded_at - started_at;
+
         match res {
             Ok(()) => {
                 tracing::info!(
                     bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
                     "responded /metrics"
                 );
             }
             Err(e) => {
-                tracing::warn!("failed to write out /metrics response: {e:#}");
+                // there is a chance that this error is not the BrokenPipe we generate in the writer
+                // for "closed connection", but it is highly unlikely.
+                tracing::warn!(
+                    after_bytes = writer.flushed_bytes(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
+                    "failed to write out /metrics response: {e:?}"
+                );
                 // semantics of this error are quite... unclear. we want to error the stream out to
                 // abort the response to somehow notify the client that we failed.
                 //

From 4dbb74b559d09361df09b96a1225d889cb2f577d Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Fri, 1 Mar 2024 15:33:08 +0100
Subject: [PATCH 12/18] new test for LFC stats in explain (#6968)

## Problem

PR https://github.com/neondatabase/neon/pull/6851 implemented new output
in PostgreSQL explain.
this is a test case for the new function.

## Summary of changes

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [no ] Do we need to implement analytics? if so did you add the
relevant metrics to the dashboard?
- [no] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 .../regress/test_explain_with_lfc_stats.py    | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 test_runner/regress/test_explain_with_lfc_stats.py

diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
new file mode 100644
index 0000000000..5231dedcda
--- /dev/null
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    cache_dir = Path(env.repo_dir) / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+
+    branchname = "test_explain_with_lfc_stats"
+    env.neon_cli.create_branch(branchname, "empty")
+    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    endpoint = env.endpoints.create_start(
+        branchname,
+        config_lines=[
+            "shared_buffers='1MB'",
+            f"neon.file_cache_path='{cache_dir}/file.cache'",
+            "neon.max_file_cache_size='128MB'",
+            "neon.file_cache_size_limit='64MB'",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+
+    log.info(f"preparing some data in {endpoint.connstr()}")
+
+    ddl = """
+CREATE TABLE pgbench_accounts (
+    aid bigint NOT NULL,
+    bid integer,
+    abalance integer,
+    filler character(84),
+    -- more web-app like columns
+    text_column_plain TEXT  DEFAULT repeat('NeonIsCool', 5),
+    jsonb_column_extended JSONB  DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
+)
+WITH (fillfactor='100');
+"""
+
+    cur.execute(ddl)
+    cur.execute(
+        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
+    )
+
+    log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
+    cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
+
+    log.info("running explain analyze without LFC values to verify they do not show up in the plan")
+    cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" not in plan
+    log.info("running explain analyze WITH LFC values to verify they do now show up")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" in plan
+    log.info("running explain analyze WITH LFC values to verify json output")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    jsonplan = cur.fetchall()[0][0]
+    log.debug(jsonplan)
+    # Directly access the 'Plan' part of the first element of the JSON array
+    plan_details = jsonplan[0]["Plan"]
+
+    # Extract "File Cache Hits" and "File Cache Misses"
+    file_cache_hits = plan_details.get("File Cache Hits")
+    file_cache_misses = plan_details.get("File Cache Misses")
+
+    # Now you can assert the values
+    assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
+    assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"

From 1efaa16260d081345febe46be26ff01b68053056 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 1 Mar 2024 14:43:33 +0000
Subject: [PATCH 13/18] test: add test for checkpoint timeout flushing (#6950)

## Problem
https://github.com/neondatabase/neon/pull/6661 changed the layer
flushing logic and led to OOMs in staging.
The issue turned out to be holding on to in-memory layers for too long.
After OOMing we'd need to replay potentially
a lot of WAL.

## Summary of changes
Test that open layers get flushed after the `checkpoint_timeout` config
and do not require WAL reingest upon restart.
The workload creates a number of timelines and writes some data to each,
but not enough to trigger flushes via the `checkpoint_distance` config.

I ran this test against https://github.com/neondatabase/neon/pull/6661
and it was indeed failing.
---
 test_runner/fixtures/pageserver/utils.py      |   4 +-
 .../test_pageserver_small_inmemory_layers.py  | 110 ++++++++++++++++++
 2 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 test_runner/regress/test_pageserver_small_inmemory_layers.py

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index c600733e41..cf64c86821 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
             return current_lsn
         if i % 10 == 0:
             log.info(
-                "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                    lsn, current_lsn, i + 1
+                "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
+                    tenant, timeline, lsn, current_lsn, i + 1
                 )
             )
         time.sleep(0.1)
diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py
new file mode 100644
index 0000000000..5d55020e3c
--- /dev/null
+++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py
@@ -0,0 +1,110 @@
+import asyncio
+import time
+from typing import Tuple
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    tenant_get_shards,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
+
+TIMELINE_COUNT = 10
+ENTRIES_PER_TIMELINE = 10_000
+CHECKPOINT_TIMEOUT_SECONDS = 60
+
+TENANT_CONF = {
+    # Large `checkpoint_distance` effectively disables size
+    # based checkpointing.
+    "checkpoint_distance": f"{2 * 1024 ** 3}",
+    "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
+}
+
+
+async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
+    tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
+    with env.endpoints.create_start("main", tenant_id=tenant) as ep:
+        conn = await ep.connect_async()
+        try:
+            await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
+            await conn.execute(
+                f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
+            )
+        finally:
+            await conn.close(timeout=10)
+
+        last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+        return tenant, timeline, last_flush_lsn
+
+
+async def workload(
+    env: NeonEnv, timelines: int, entries: int
+) -> list[Tuple[TenantId, TimelineId, Lsn]]:
+    workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
+    return await asyncio.gather(*workers)
+
+
+def wait_until_pageserver_is_caught_up(
+    env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
+):
+    for tenant, timeline, last_flush_lsn in last_flush_lsns:
+        shards = tenant_get_shards(env, tenant)
+        for tenant_shard_id, pageserver in shards:
+            waited = wait_for_last_record_lsn(
+                pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
+            )
+            assert waited >= last_flush_lsn
+
+
+def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
+    def query():
+        value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
+        assert value is not None
+        return value
+
+    # The metric gets initialised on the first update.
+    # Retry a few times, but return 0 if it's stable.
+    try:
+        return float(wait_until(3, 0.5, query))
+    except Exception:
+        return 0
+
+
+@pytest.mark.parametrize("immediate_shutdown", [True, False])
+def test_pageserver_small_inmemory_layers(
+    neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
+):
+    """
+    Test that open layers get flushed after the `checkpoint_timeout` config
+    and do not require WAL reingest upon restart.
+
+    The workload creates a number of timelines and writes some data to each,
+    but not enough to trigger flushes via the `checkpoint_distance` config.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
+    wait_until_pageserver_is_caught_up(env, last_flush_lsns)
+
+    ps_http_client = env.pageserver.http_client()
+    total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
+
+    log.info("Sleeping for checkpoint timeout ...")
+    time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)
+
+    env.pageserver.restart(immediate=immediate_shutdown)
+    wait_until_pageserver_is_caught_up(env, last_flush_lsns)
+
+    total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
+
+    log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
+    log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
+
+    leeway = total_wal_ingested_before_restart * 5 / 100
+    assert total_wal_ingested_after_restart <= leeway

From 82853cc1d1047a2efefa40355293ee9f348357ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 1 Mar 2024 17:14:19 +0100
Subject: [PATCH 14/18] Fix warnings and compile errors on nightly (#6886)

Nightly has added a bunch of compiler and linter warnings. There is also
two dependencies that fail compilation on latest nightly due to using
the old `stdsimd` feature name. This PR fixes them.
---
 Cargo.lock                                      |  8 ++++----
 compute_tools/src/compute.rs                    |  2 --
 compute_tools/src/extension_server.rs           |  2 +-
 compute_tools/src/http/api.rs                   |  2 --
 control_plane/attachment_service/src/main.rs    |  2 +-
 .../attachment_service/src/persistence.rs       |  6 ++++--
 .../attachment_service/src/scheduler.rs         |  1 -
 control_plane/src/attachment_service.rs         |  2 +-
 libs/pageserver_api/src/models.rs               |  2 --
 libs/pageserver_api/src/shard.rs                |  6 +-----
 libs/remote_storage/src/local_fs.rs             |  2 --
 libs/remote_storage/src/s3_bucket.rs            |  2 +-
 libs/utils/src/auth.rs                          |  1 -
 libs/utils/src/completion.rs                    |  6 ++++--
 libs/utils/src/http/endpoint.rs                 |  2 +-
 libs/utils/src/lsn.rs                           |  1 -
 libs/utils/src/seqwait.rs                       |  3 +--
 libs/utils/src/simple_rcu.rs                    |  2 +-
 libs/utils/src/sync/heavier_once_cell.rs        |  1 -
 pageserver/compaction/src/helpers.rs            |  1 -
 pageserver/src/config.rs                        |  6 +-----
 .../src/consumption_metrics/metrics/tests.rs    |  2 --
 pageserver/src/deletion_queue.rs                | 17 ++++-------------
 pageserver/src/metrics.rs                       | 11 +++++------
 pageserver/src/page_cache.rs                    | 11 ++++++-----
 pageserver/src/page_service.rs                  |  3 +--
 pageserver/src/repository.rs                    |  1 -
 pageserver/src/tenant.rs                        | 16 +++-------------
 pageserver/src/tenant/disk_btree.rs             |  3 ---
 pageserver/src/tenant/ephemeral_file.rs         |  2 +-
 pageserver/src/tenant/mgr.rs                    |  2 +-
 pageserver/src/tenant/remote_timeline_client.rs |  4 +---
 .../src/tenant/secondary/heatmap_uploader.rs    |  1 -
 .../src/tenant/storage_layer/image_layer.rs     |  1 -
 pageserver/src/walingest.rs                     |  2 --
 pageserver/src/walredo/apply_neon.rs            |  2 --
 pageserver/src/walredo/process/no_leak_child.rs |  4 +---
 proxy/src/bin/pg_sni_router.rs                  |  2 +-
 proxy/src/cache/project_info.rs                 |  3 +--
 proxy/src/console/mgmt.rs                       |  2 +-
 proxy/src/proxy/tests.rs                        |  2 +-
 proxy/src/proxy/tests/mitm.rs                   |  1 -
 proxy/src/serverless/conn_pool.rs               |  1 -
 safekeeper/src/control_file.rs                  |  7 +------
 safekeeper/src/handler.rs                       |  5 ++---
 45 files changed, 51 insertions(+), 114 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dead212156..c23162971e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "ahash"
-version = "0.8.5"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
+checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
 dependencies = [
  "cfg-if",
  "const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
 
 [[package]]
 name = "crc32c"
-version = "0.6.3"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
  "rustc_version",
 ]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 142bb14fe5..a82b999cfb 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -18,8 +18,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use tokio;
-use tokio_postgres;
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index 2cec12119f..ef1db73982 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
     }
 }
 */
-use anyhow::{self, Result};
+use anyhow::Result;
 use anyhow::{bail, Context};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index f076951239..128783b477 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use num_cpus;
-use serde_json;
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 5b952ae4fc..d9acbc0abd 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
-use aws_config::{self, BehaviorVersion, Region};
+use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 1b98cc7655..4c6eb2291c 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -7,8 +7,10 @@ use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
-use diesel::prelude::*;
-use diesel::Connection;
+use diesel::{
+    Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
+    Selectable, SelectableHelper,
+};
 use pageserver_api::controller_api::NodeSchedulingPolicy;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 3224751e47..87fce3df25 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -284,7 +284,6 @@ pub(crate) mod test_utils {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use utils::id::NodeId;
 
     use crate::tenant_state::IntentState;
     #[test]
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 92342b478b..610d7386d9 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -200,7 +200,7 @@ impl AttachmentService {
                 "localhost",
                 "-p",
                 &format!("{}", self.postgres_port),
-                &DB_NAME,
+                DB_NAME,
             ])
             .output()
             .await
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 61aa8a5ae8..d583866290 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
-use strum_macros;
 use utils::{
     completion,
     history_buffer::HistoryBufferWithDropCounter,
@@ -1077,7 +1076,6 @@ impl PagestreamBeMessage {
 
 #[cfg(test)]
 mod tests {
-    use bytes::Buf;
     use serde_json::json;
 
     use super::*;
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 467a4cf0c1..a2a9165184 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
-use thiserror;
 use utils::id::TenantId;
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
 
 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use bincode;
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;
 
     use super::*;
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 6f847cf9d7..478ad81dc1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
     use super::*;
 
-    use bytes::Bytes;
     use camino_tempfile::tempdir;
-    use futures_util::Stream;
     use std::{collections::HashMap, io::Write};
 
     async fn read_and_check_metadata(
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index af70dc7ca2..438f45fbde 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -1040,7 +1040,7 @@ mod tests {
             Some("test/prefix/"),
             Some("/test/prefix/"),
         ];
-        let expected_outputs = vec![
+        let expected_outputs = [
             vec!["", "some/path", "some/path"],
             vec!["/", "/some/path", "/some/path"],
             vec![
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index fbf0dff665..03e65f74fe 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,6 @@
 // For details about authentication see docs/authentication.md
 
 use arc_swap::ArcSwap;
-use serde;
 use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
 
 use anyhow::Result;
diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
index ea05cf54b1..2fef8d35df 100644
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(TaskTrackerToken);
+pub struct Completion {
+    _token: TaskTrackerToken,
+}
 
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
@@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
     tracker.close();
 
     let token = tracker.token();
-    (Completion(token), Barrier(tracker))
+    (Completion { _token: token }, Barrier(tracker))
 }
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 3c71628870..a60971abf0 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tracing::{self, debug, info, info_span, warn, Instrument};
+use tracing::{debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
 use std::str::FromStr;
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index b3269ae049..1aebe91428 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -415,7 +415,6 @@ mod tests {
 
     use super::*;
 
-    use serde::ser::Serialize;
     use serde_assert::{Deserializer, Serializer, Token, Tokens};
 
     #[test]
diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index effc9c67b5..b7301776eb 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -1,6 +1,6 @@
 #![warn(missing_docs)]
 
-use std::cmp::{Eq, Ordering, PartialOrd};
+use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::mem;
@@ -249,7 +249,6 @@ where
 mod tests {
     use super::*;
     use std::sync::Arc;
-    use std::time::Duration;
 
     impl MonotonicCounter<i32> for i32 {
         fn cnt_advance(&mut self, val: i32) {
diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index dc4a599111..ecc5353be3 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -221,7 +221,7 @@ impl RcuWaitList {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::sync::{Arc, Mutex};
+    use std::sync::Mutex;
     use std::time::Duration;
 
     #[tokio::test]
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 0773abba2d..703a6dfd52 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -239,7 +239,6 @@ mod tests {
     use std::{
         convert::Infallible,
         pin::{pin, Pin},
-        sync::atomic::{AtomicUsize, Ordering},
         time::Duration,
     };
 
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index a12f691504..22a410b4af 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -6,7 +6,6 @@ use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use pin_project_lite::pin_project;
-use std::cmp::Ord;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
 use std::future::Future;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 0a7172bde2..437387164d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -20,7 +20,6 @@ use std::num::NonZeroUsize;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use toml_edit;
 use toml_edit::{Document, Item};
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -1203,10 +1202,7 @@ impl ConfigurableSemaphore {
 
 #[cfg(test)]
 mod tests {
-    use std::{
-        fs,
-        num::{NonZeroU32, NonZeroUsize},
-    };
+    use std::{fs, num::NonZeroU32};
 
     use camino_tempfile::{tempdir, Utf8TempDir};
     use pageserver_api::models::EvictionPolicy;
diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs
index 38a4c9eb5d..f9cbcea565 100644
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -1,7 +1,5 @@
 use super::*;
 use std::collections::HashMap;
-use std::time::SystemTime;
-use utils::lsn::Lsn;
 
 #[test]
 fn startup_collected_timeline_metrics_before_advancing() {
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index ca9ae8f983..313eb2663d 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
 use thiserror::Error;
-use tokio;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use tracing::{self, debug, error};
+use tracing::{debug, error};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::generation::Generation;
 use utils::id::TimelineId;
@@ -726,7 +725,7 @@ mod test {
     use camino::Utf8Path;
     use hex_literal::hex;
     use pageserver_api::shard::ShardIndex;
-    use std::{io::ErrorKind, time::Duration};
+    use std::io::ErrorKind;
     use tracing::info;
 
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -735,10 +734,7 @@ mod test {
     use crate::{
         control_plane_client::RetryForeverError,
         repository::Key,
-        tenant::{
-            harness::TenantHarness, remote_timeline_client::remote_timeline_path,
-            storage_layer::DeltaFileName,
-        },
+        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
     };
 
     use super::*;
@@ -1161,13 +1157,8 @@ mod test {
 pub(crate) mod mock {
     use tracing::info;
 
-    use crate::tenant::remote_timeline_client::remote_layer_path;
-
     use super::*;
-    use std::sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc,
-    };
+    use std::sync::atomic::{AtomicUsize, Ordering};
 
     pub struct ConsumerState {
         rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1d894ed8a5..ce5561b431 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1915,17 +1915,16 @@ impl Drop for TimelineMetrics {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ =
-                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
-            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
 
         self.evictions_with_low_residence_duration
             .write()
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 28d2584bf4..529fb9bb07 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -73,7 +73,6 @@
 
 use std::{
     collections::{hash_map::Entry, HashMap},
-    convert::TryInto,
     sync::{
         atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
         Arc, Weak,
@@ -262,7 +261,9 @@ pub struct PageCache {
     size_metrics: &'static PageCacheSizeMetrics,
 }
 
-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+struct PinnedSlotsPermit {
+    _permit: tokio::sync::OwnedSemaphorePermit,
+}
 
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -558,9 +559,9 @@ impl PageCache {
         )
         .await
         {
-            Ok(res) => Ok(PinnedSlotsPermit(
-                res.expect("this semaphore is never closed"),
-            )),
+            Ok(res) => Ok(PinnedSlotsPermit {
+                _permit: res.expect("this semaphore is never closed"),
+            }),
             Err(_timeout) => {
                 crate::metrics::page_cache_errors_inc(
                     crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index cd9c48f9af..689bc5cb3c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -27,7 +27,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
-use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
-use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index c726139524..9959d105eb 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,6 @@ impl Value {
 mod test {
     use super::*;
 
-    use bytes::Bytes;
     use utils::bin_ser::BeSer;
 
     macro_rules! roundtrip {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f027e9d4b1..4158133111 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -109,7 +109,6 @@ pub use pageserver_api::models::TenantState;
 use tokio::sync::Semaphore;
 
 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
-use toml_edit;
 use utils::{
     crashsafe,
     generation::Generation,
@@ -2384,7 +2383,7 @@ impl Tenant {
             self.tenant_shard_id,
             self.generation,
             self.shard_identity,
-            self.walredo_mgr.as_ref().map(Arc::clone),
+            self.walredo_mgr.clone(),
             resources,
             pg_version,
             state,
@@ -3593,25 +3592,18 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
     use bytes::{Bytes, BytesMut};
-    use camino::Utf8PathBuf;
     use once_cell::sync::OnceCell;
     use pageserver_api::models::ShardParameters;
     use pageserver_api::shard::ShardIndex;
-    use std::fs;
-    use std::sync::Arc;
     use utils::logging;
-    use utils::lsn::Lsn;
 
     use crate::deletion_queue::mock::MockDeletionQueue;
     use crate::walredo::apply_neon;
-    use crate::{
-        config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
-    };
+    use crate::{repository::Key, walrecord::NeonWalRecord};
 
     use super::*;
-    use crate::tenant::config::{TenantConf, TenantConfOpt};
     use hex_literal::hex;
-    use utils::id::{TenantId, TimelineId};
+    use utils::id::TenantId;
 
     pub const TIMELINE_ID: TimelineId =
         TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3840,10 +3832,8 @@ mod tests {
     use crate::DEFAULT_PG_VERSION;
     use bytes::BytesMut;
     use hex_literal::hex;
-    use once_cell::sync::Lazy;
     use pageserver_api::keyspace::KeySpace;
     use rand::{thread_rng, Rng};
-    use tokio_util::sync::CancellationToken;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 9f104aff86..ca30b0ac4f 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -21,7 +21,6 @@
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
 use tracing::error;
@@ -700,8 +699,6 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 pub(crate) mod tests {
     use super::*;
-    use crate::context::DownloadBehavior;
-    use crate::task_mgr::TaskKind;
     use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
     use rand::Rng;
     use std::collections::BTreeMap;
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 2bedbf7f61..e48b9e83bd 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -300,7 +300,7 @@ mod tests {
     use super::*;
     use crate::context::DownloadBehavior;
     use crate::task_mgr::TaskKind;
-    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
+    use crate::tenant::block_io::BlockReaderRef;
     use rand::{thread_rng, RngCore};
     use std::fs;
     use std::str::FromStr;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 805d44f93d..06b61d4631 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
 
     let tenant = guard
         .get(&tenant_shard_id)
-        .map(Arc::clone)
+        .cloned()
         .with_context(|| format!("tenant {tenant_shard_id}"))
         .map_err(|e| ApiError::NotFound(e.into()))?;
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 7d30745a0d..40be2ca8f3 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1791,14 +1791,12 @@ mod tests {
         context::RequestContext,
         tenant::{
             harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::Layer,
-            Generation, Tenant, Timeline,
+            Tenant, Timeline,
         },
         DEFAULT_PG_VERSION,
     };
 
     use std::collections::HashSet;
-    use utils::lsn::Lsn;
 
     pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
         format!("contents for {name}").into()
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 147cf683ba..a8b05f4c0e 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -18,7 +18,6 @@ use crate::{
 };
 
 use futures::Future;
-use md5;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
 use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 0a707295cc..56cfaeda15 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
-use hex;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3a2705bb50..63a2b30d09 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1667,8 +1667,6 @@ mod tests {
     use super::*;
     use crate::tenant::harness::*;
     use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
-    use crate::tenant::Timeline;
-    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
     use postgres_ffi::RELSEG_SIZE;
 
     use crate::DEFAULT_PG_VERSION;
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 6ce90e0c47..247704e2a5 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -252,8 +252,6 @@ mod test {
     use super::*;
     use std::collections::HashMap;
 
-    use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
-
     /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
     #[test]
     fn apply_aux_file_deltas() -> anyhow::Result<()> {
diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs
index ca016408e6..1a0d7039df 100644
--- a/pageserver/src/walredo/process/no_leak_child.rs
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -1,7 +1,5 @@
-use tracing;
-use tracing::error;
-use tracing::info;
 use tracing::instrument;
+use tracing::{error, info};
 
 use crate::metrics::WalRedoKillCause;
 use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 5024ba3744..d5ab66d6aa 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;
 
 use anyhow::{anyhow, bail, ensure, Context};
-use clap::{self, Arg};
+use clap::Arg;
 use futures::TryFutureExt;
 use proxy::console::messages::MetricsAuxInfo;
 use proxy::stream::{PqStream, Stream};
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 62015312a9..6e3eb8c1b0 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -358,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{console::AuthSecret, scram::ServerSecret};
-    use std::{sync::Arc, time::Duration};
+    use crate::scram::ServerSecret;
 
     #[tokio::test]
     async fn test_project_info_cache_settings() {
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 373138b09e..c7a2d467c0 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -4,7 +4,7 @@ use crate::{
 };
 use anyhow::Context;
 use once_cell::sync::Lazy;
-use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
+use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
 use std::{convert::Infallible, future};
 use tokio::net::{TcpListener, TcpStream};
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 595d9c4979..d866b1820f 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
-use crate::{auth, http, sasl, scram};
+use crate::{http, sasl, scram};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index ed89e51754..e0c2d836f4 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut};
 use futures::{SinkExt, StreamExt};
 use postgres_protocol::message::frontend;
 use tokio::io::{AsyncReadExt, DuplexStream};
-use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::TlsConnect;
 use tokio_util::codec::{Decoder, Encoder};
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 53e7c1c2ee..7d705ba049 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -667,7 +667,6 @@ impl<C: ClientInnerExt> Drop for Client<C> {
 
 #[cfg(test)]
 mod tests {
-    use env_logger;
     use std::{mem, sync::atomic::AtomicBool};
 
     use super::*;
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index c39c1dbf28..d822c87c0e 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -19,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
 
 use crate::SafeKeeperConf;
 
-use std::convert::TryInto;
-
 pub const SK_MAGIC: u32 = 0xcafeceefu32;
 pub const SK_FORMAT_VERSION: u32 = 7;
 
@@ -219,12 +217,9 @@ impl Storage for FileStorage {
 
 #[cfg(test)]
 mod test {
-    use super::FileStorage;
     use super::*;
-    use crate::SafeKeeperConf;
-    use anyhow::Result;
     use tokio::fs;
-    use utils::{id::TenantTimelineId, lsn::Lsn};
+    use utils::lsn::Lsn;
 
     fn stub_conf() -> SafeKeeperConf {
         let workdir = camino_tempfile::tempdir().unwrap().into_path();
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 761541168c..f45bfb95fa 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,8 +2,7 @@
 //! protocol commands.
 
 use anyhow::Context;
-use std::str::FromStr;
-use std::str::{self};
+use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info, info_span, Instrument};
@@ -16,8 +15,8 @@ use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
+use postgres_backend::PostgresBackend;
 use postgres_backend::QueryError;
-use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;

From d999c4669240a6dec67311ae8a10c8e0bd026977 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 16:19:40 +0000
Subject: [PATCH 15/18] pageserver: handle temp_download files in secondary
 locations (#6990)

## Problem

PR #6837 fixed secondary locations to avoid spamming log warnings on
temp files, but we also have ".temp_download" files to consider.

## Summary of changes

- Give temp_download files the same behavior as temp files.
- Refactor the relevant helper to pub(crate) from pub
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 2 +-
 pageserver/src/tenant/secondary/downloader.rs            | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 962cf5d12e..167e18a829 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
 
 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
 
-pub fn is_temp_download_file(path: &Utf8Path) -> bool {
+pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
     let extension = path.extension();
     match extension {
         Some(TEMP_DOWNLOAD_EXTENSION) => true,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 5c4e4fd160..b679077358 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -16,7 +16,8 @@ use crate::{
         config::SecondaryLocationConfig,
         debug_assert_current_span_has_tenant_and_timeline_id,
         remote_timeline_client::{
-            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
         },
         span::debug_assert_current_span_has_tenant_id,
         storage_layer::LayerFileName,
@@ -788,7 +789,7 @@ async fn init_timeline_state(
             // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
             warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
             continue;
-        } else if crate::is_temporary(&file_path) {
+        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
             // Temporary files are frequently left behind from restarting during downloads
             tracing::info!("Cleaning up temporary file {file_path}");
             if let Err(e) = tokio::fs::remove_file(&file_path)

From e34059cd185998d8ae60ba3e2086a7258ec6fdb7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 16:49:37 +0000
Subject: [PATCH 16/18] pageserver: increase
 DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG (#6970)

## Problem

At high ingest rates, pageservers spuriously disconnect from safekeepers
because stats updates don't come in frequently enough to keep the
broker/safekeeper LSN delta under the wal lag limit.

## Summary of changes

- Increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG from 10MiB to 1GiB. This
should be enough for realistic per-timeline throughputs.
---
 pageserver/src/tenant/config.rs         | 5 ++++-
 test_runner/regress/test_tenant_conf.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 18c4ea664e..9464324413 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -52,7 +52,10 @@ pub mod defaults {
     pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
+    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
+    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
+    // throughputs up to 1GiB/s per timeline.
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index a2ffd200a6..fc099297e1 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
         "period": "20s",
         "threshold": "23h",
     }
-    assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
+    assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()

From ea0d35f3ca7b58ba4be820d4a161fd2380806b2b Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 1 Mar 2024 14:54:07 -0500
Subject: [PATCH 17/18] neon_local: improved docs and fix wrong connstr (#6954)

The user created with the `--create-test-user` flag is `test` instead of
`user`.

ref https://github.com/neondatabase/neon/pull/6848

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 README.md                     |  2 ++
 control_plane/README.md       | 26 ++++++++++++++++++++++++++
 control_plane/src/endpoint.rs |  2 +-
 3 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 control_plane/README.md

diff --git a/README.md b/README.md
index 95926b4628..c44ae695d6 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,8 @@ postgres=# select * from t;
 > cargo neon stop
 ```
 
+More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
+
 #### Handling build failures
 
 If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
diff --git a/control_plane/README.md b/control_plane/README.md
new file mode 100644
index 0000000000..827aba5c1f
--- /dev/null
+++ b/control_plane/README.md
@@ -0,0 +1,26 @@
+# Control Plane and Neon Local
+
+This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
+
+## Example: Start with Postgres 16
+
+To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
+
+```shell
+cargo neon init --pg-version 16
+cargo neon start
+cargo neon tenant create --set-default --pg-version 16
+cargo neon endpoint create main --pg-version 16
+cargo neon endpoint start main
+```
+
+## Example: Create Test User and Database
+
+By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
+
+```shell
+cargo neon endpoint create main --pg-version 16 --update-catalog true
+cargo neon endpoint start main --create-test-user true
+```
+
+The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index de7eb797d6..5a75bc2a1d 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -605,7 +605,7 @@ impl Endpoint {
         let conn_str = self.connstr("cloud_admin", "postgres");
         println!("Starting postgres node at '{}'", conn_str);
         if create_test_user {
-            let conn_str = self.connstr("user", "neondb");
+            let conn_str = self.connstr("test", "neondb");
             println!("Also at '{}'", conn_str);
         }
         let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

From 20d0939b0032a4ed99359af33f2bbc253de4807a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 20:25:53 +0000
Subject: [PATCH 18/18] control_plane/attachment_service: implement
 PlacementPolicy::Secondary, configuration updates (#6521)

During onboarding, the control plane may attempt ad-hoc creation of a
secondary location to facilitate live migration. This gives us two
problems to solve:
- Accept 'Secondary' mode in /location_config and use it to put the
tenant into secondary mode on some physical pageserver, then pass
through /tenant/xyz/secondary/download requests
- Create tenants with no generation initially, since the initial
`Secondary` mode call will not provide us a generation.

This PR also fixes modification of a tenant's TenantConf during
/location_conf, which was previously ignored, and refines the flow for
config modification:
- avoid bumping generations when the only reason we're reconciling an
attached location is a config change
- increment TenantState.sequence when spawning a reconciler: usually
schedule() does this, but when we do config changes that doesn't happen,
so without this change waiters would think reconciliation was done
immediately. `sequence` is a bit of a murky thing right now, as it's
dual-purposed for tracking waiters, and for checking if an existing
reconciliation is already making updates to our current sequence. I'll
follow up at some point to clarify it's purpose.
- test config modification at the end of onboarding test
---
 .../down.sql                                  |   2 +
 .../2024-02-29-094122_generations_null/up.sql |   4 +
 control_plane/attachment_service/src/http.rs  |  48 +-
 control_plane/attachment_service/src/lib.rs   |  10 +-
 .../attachment_service/src/persistence.rs     | 101 ++-
 .../attachment_service/src/reconciler.rs      |  73 +-
 .../attachment_service/src/schema.rs          |   4 +-
 .../attachment_service/src/service.rs         | 623 +++++++++++++-----
 .../attachment_service/src/tenant_state.rs    | 115 +++-
 libs/utils/src/generation.rs                  |   2 +-
 test_runner/regress/test_sharding_service.py  |  91 ++-
 11 files changed, 842 insertions(+), 231 deletions(-)
 create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql

diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
new file mode 100644
index 0000000000..503231f69d
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
@@ -0,0 +1,2 @@
+ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;
diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
new file mode 100644
index 0000000000..7e1e3cfe90
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
@@ -0,0 +1,4 @@
+
+
+ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index f1153c2c18..384bdcef0c 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,9 +1,10 @@
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::PlacementPolicy;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
+    TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
     TenantTimeTravelRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
@@ -117,9 +118,14 @@ async fn handle_tenant_create(
     check_permissions(&req, Scope::PageServerApi)?;
 
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
+
+    // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
+    // have no expectation of HA).
+    let placement_policy = PlacementPolicy::Single;
+
     json_response(
         StatusCode::CREATED,
-        service.tenant_create(create_req).await?,
+        service.tenant_create(create_req, placement_policy).await?,
     )
 }
 
@@ -185,6 +191,27 @@ async fn handle_tenant_location_config(
     )
 }
 
+async fn handle_tenant_config_set(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
+
+    json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
+}
+
+async fn handle_tenant_config_get(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
+}
+
 async fn handle_tenant_time_travel_remote_storage(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -216,7 +243,15 @@ async fn handle_tenant_time_travel_remote_storage(
             done_if_after_raw,
         )
         .await?;
+    json_response(StatusCode::OK, ())
+}
 
+async fn handle_tenant_secondary_download(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    service.tenant_secondary_download(tenant_id).await?;
     json_response(StatusCode::OK, ())
 }
 
@@ -551,12 +586,21 @@ pub fn make_router(
         .delete("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(r, handle_tenant_delete)
         })
+        .put("/v1/tenant/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_set)
+        })
+        .get("/v1/tenant/:tenant_id/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_get)
+        })
         .put("/v1/tenant/:tenant_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
         .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
             tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
         })
+        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
+            tenant_service_handler(r, handle_tenant_secondary_download)
+        })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_delete)
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index ce613e858f..7ae7e264c7 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -13,14 +13,20 @@ mod schema;
 pub mod service;
 mod tenant_state;
 
-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 enum PlacementPolicy {
     /// Cheapest way to attach a tenant: just one pageserver, no secondary
     Single,
     /// Production-ready way to attach a tenant: one attached pageserver and
     /// some number of secondaries.
     Double(usize),
-    /// Do not attach to any pageservers
+    /// Create one secondary mode locations. This is useful when onboarding
+    /// a tenant, or for an idle tenant that we might want to bring online quickly.
+    Secondary,
+
+    /// Do not attach to any pageservers.  This is appropriate for tenants that
+    /// have been idle for a long time, where we do not mind some delay in making
+    /// them available in future.
     Detached,
 }
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 4c6eb2291c..d5c304385c 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -333,7 +333,15 @@ impl Persistence {
                 shard_number: ShardNumber(tsp.shard_number as u8),
                 shard_count: ShardCount::new(tsp.shard_count as u8),
             };
-            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+
+            let Some(g) = tsp.generation else {
+                // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
+                // we only set generation_pageserver when setting generation.
+                return Err(DatabaseError::Logical(
+                    "Generation should always be set after incrementing".to_string(),
+                ));
+            };
+            result.insert(tenant_shard_id, Generation::new(g as u32));
         }
 
         Ok(result)
@@ -366,7 +374,85 @@ impl Persistence {
             })
             .await?;
 
-        Ok(Generation::new(updated.generation as u32))
+        // Generation is always non-null in the rseult: if the generation column had been NULL, then we
+        // should have experienced an SQL Confilict error while executing a query that tries to increment it.
+        debug_assert!(updated.generation.is_some());
+        let Some(g) = updated.generation else {
+            return Err(DatabaseError::Logical(
+                "Generation should always be set after incrementing".to_string(),
+            )
+            .into());
+        };
+
+        Ok(Generation::new(g as u32))
+    }
+
+    /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
+    ///
+    /// Do not use this for settting generation, unless in the special onboarding code path (/location_config)
+    /// API: use [`Self::increment_generation`] instead.  Setting the generation via this route is a one-time thing
+    /// that we only do the first time a tenant is set to an attached policy via /location_config.
+    pub(crate) async fn update_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+        input_placement_policy: PlacementPolicy,
+        input_config: TenantConfig,
+        input_generation: Option<Generation>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            let query = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
+
+            if let Some(input_generation) = input_generation {
+                // Update includes generation column
+                query
+                    .set((
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            } else {
+                // Update does not include generation column
+                query
+                    .set((
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            }
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn update_tenant_config(
+        &self,
+        input_tenant_id: TenantId,
+        input_config: TenantConfig,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            diesel::update(tenant_shards)
+                .filter(tenant_id.eq(input_tenant_id.to_string()))
+                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
     }
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -377,7 +463,7 @@ impl Persistence {
                 .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
                 .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                 .set((
-                    generation_pageserver.eq(i64::MAX),
+                    generation_pageserver.eq(Option::<i64>::None),
                     placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
                 ))
                 .execute(conn)?;
@@ -503,12 +589,15 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) shard_stripe_size: i32,
 
     // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: i32,
+    // and use the incremented number when attaching.
+    //
+    // Generation is only None when first onboarding a tenant, where it may
+    // be in PlacementPolicy::Secondary and therefore have no valid generation state.
+    pub(crate) generation: Option<i32>,
 
     // Currently attached pageserver
     #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: i64,
+    pub(crate) generation_pageserver: Option<i64>,
 
     #[serde(default)]
     pub(crate) placement_policy: String,
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index ce91c1f5e9..b633b217c7 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
     /// of a tenant's state from when we spawned a reconcile task.
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
     pub(crate) intent: TargetState,
     pub(crate) config: TenantConfig,
     pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedStale,
-            Some(self.generation),
+            self.generation,
             None,
         );
         self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,16 +335,17 @@ impl Reconciler {
         }
 
         // Increment generation before attaching to new pageserver
-        self.generation = self
-            .persistence
-            .increment_generation(self.tenant_shard_id, dest_ps_id)
-            .await?;
+        self.generation = Some(
+            self.persistence
+                .increment_generation(self.tenant_shard_id, dest_ps_id)
+                .await?,
+        );
 
         let dest_conf = build_location_config(
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedMulti,
-            Some(self.generation),
+            self.generation,
             None,
         );
 
@@ -401,7 +402,7 @@ impl Reconciler {
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedSingle,
-            Some(self.generation),
+            self.generation,
             None,
         );
         self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -433,22 +434,62 @@ impl Reconciler {
 
         // If the attached pageserver is not attached, do so now.
         if let Some(node_id) = self.intent.attached {
-            let mut wanted_conf =
-                attached_location_conf(self.generation, &self.shard, &self.config);
+            // If we are in an attached policy, then generation must have been set (null generations
+            // are only present when a tenant is initially loaded with a secondary policy)
+            debug_assert!(self.generation.is_some());
+            let Some(generation) = self.generation else {
+                return Err(ReconcileError::Other(anyhow::anyhow!(
+                    "Attempted to attach with NULL generation"
+                )));
+            };
+
+            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
                     tracing::info!(%node_id, "Observed configuration already correct.")
                 }
-                _ => {
+                observed => {
                     // In all cases other than a matching observed configuration, we will
                     // reconcile this location.  This includes locations with different configurations, as well
                     // as locations with unknown (None) observed state.
-                    self.generation = self
-                        .persistence
-                        .increment_generation(self.tenant_shard_id, node_id)
-                        .await?;
-                    wanted_conf.generation = self.generation.into();
+
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
+                    let increment_generation = match observed {
+                        None => true,
+                        Some(ObservedStateLocation { conf: None }) => true,
+                        Some(ObservedStateLocation {
+                            conf: Some(observed),
+                        }) => {
+                            let generations_match = observed.generation == wanted_conf.generation;
+
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
+                        }
+                    };
+
+                    if increment_generation {
+                        let generation = self
+                            .persistence
+                            .increment_generation(self.tenant_shard_id, node_id)
+                            .await?;
+                        self.generation = Some(generation);
+                        wanted_conf.generation = generation.into();
+                    }
                     tracing::info!(%node_id, "Observed configuration requires update.");
                     self.location_config(node_id, wanted_conf, None).await?;
                     self.compute_notify().await?;
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
index db5a957443..76e4e56a66 100644
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -17,8 +17,8 @@ diesel::table! {
         shard_number -> Int4,
         shard_count -> Int4,
         shard_stripe_size -> Int4,
-        generation -> Int4,
-        generation_pageserver -> Int8,
+        generation -> Nullable<Int4>,
+        generation_pageserver -> Nullable<Int8>,
         placement_policy -> Varchar,
         splitting -> Int2,
         config -> Text,
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 02c1a65545..4209b62db3 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -14,10 +14,13 @@ use control_plane::attachment_service::{
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
-    TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-    TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+use pageserver_api::{
+    controller_api::{
+        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
+    models::TenantConfigRequest,
 };
 use pageserver_api::{
     models::{
@@ -65,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 // some data in it.
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+// If we receive a call using Secondary mode initially, it will omit generation.  We will initialize
+// tenant shards into this generation, and as long as it remains in this generation, we will accept
+// input generation from future requests as authoritative.
+const INITIAL_GENERATION: Generation = Generation::new(0);
+
 /// How long [`Service::startup_reconcile`] is allowed to take before it should give
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -167,6 +175,21 @@ impl From<ReconcileWaitError> for ApiError {
     }
 }
 
+#[allow(clippy::large_enum_variant)]
+enum TenantCreateOrUpdate {
+    Create((TenantCreateRequest, PlacementPolicy)),
+    Update(Vec<ShardUpdate>),
+}
+
+struct ShardUpdate {
+    tenant_shard_id: TenantShardId,
+    placement_policy: PlacementPolicy,
+    tenant_config: TenantConfig,
+
+    /// If this is None, generation is not updated.
+    generation: Option<Generation>,
+}
+
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -571,6 +594,9 @@ impl Service {
         // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
         tenant.pending_compute_notification = result.pending_compute_notification;
 
+        // Let the TenantState know it is idle.
+        tenant.reconcile_complete(result.sequence);
+
         match result.result {
             Ok(()) => {
                 for (node_id, loc) in &result.observed.locations {
@@ -661,8 +687,8 @@ impl Service {
             // after when pageservers start up and register.
             let mut node_ids = HashSet::new();
             for tsp in &tenant_shard_persistence {
-                if tsp.generation_pageserver != i64::MAX {
-                    node_ids.insert(tsp.generation_pageserver);
+                if let Some(node_id) = tsp.generation_pageserver {
+                    node_ids.insert(node_id);
                 }
             }
             for node_id in node_ids {
@@ -699,18 +725,15 @@ impl Service {
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
-            if tsp.generation_pageserver != i64::MAX {
-                intent.set_attached(
-                    &mut scheduler,
-                    Some(NodeId(tsp.generation_pageserver as u64)),
-                );
+            if let Some(generation_pageserver) = tsp.generation_pageserver {
+                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
             }
 
             let new_tenant = TenantState {
                 tenant_shard_id,
                 shard: shard_identity,
                 sequence: Sequence::initial(),
-                generation: Generation::new(tsp.generation as u32),
+                generation: tsp.generation.map(|g| Generation::new(g as u32)),
                 policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
                 intent,
                 observed: ObservedState::new(),
@@ -790,8 +813,8 @@ impl Service {
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
                 shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: 0,
-                generation: 0,
-                generation_pageserver: i64::MAX,
+                generation: Some(0),
+                generation_pageserver: None,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                 splitting: SplitState::default(),
@@ -846,7 +869,7 @@ impl Service {
             .expect("Checked for existence above");
 
         if let Some(new_generation) = new_generation {
-            tenant_state.generation = new_generation;
+            tenant_state.generation = Some(new_generation);
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
             // during background scheduling/reconciliation, or during attachment service restart.
@@ -896,7 +919,7 @@ impl Service {
                     node_id,
                     ObservedStateLocation {
                         conf: Some(attached_location_conf(
-                            tenant_state.generation,
+                            tenant_state.generation.unwrap(),
                             &tenant_state.shard,
                             &tenant_state.config,
                         )),
@@ -910,7 +933,7 @@ impl Service {
         Ok(AttachHookResponse {
             gen: attach_req
                 .node_id
-                .map(|_| tenant_state.generation.into().unwrap()),
+                .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
         })
     }
 
@@ -923,7 +946,7 @@ impl Service {
             attachment: tenant_state.and_then(|s| {
                 s.intent
                     .get_attached()
-                    .map(|ps| (s.generation.into().unwrap(), ps))
+                    .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
             }),
         }
     }
@@ -973,7 +996,17 @@ impl Service {
                 continue;
             };
 
-            shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
+            // If [`Persistence::re_attach`] selected this shard, it must have alread
+            // had a generation set.
+            debug_assert!(shard_state.generation.is_some());
+            let Some(old_gen) = shard_state.generation else {
+                // Should never happen:  would only return incremented generation
+                // for a tenant that already had a non-null generation.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Generation must be set while re-attaching"
+                )));
+            };
+            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
             if let Some(observed) = shard_state
                 .observed
                 .locations
@@ -1003,7 +1036,7 @@ impl Service {
 
         for req_tenant in validate_req.tenants {
             if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_state.generation == Generation::new(req_tenant.gen);
+                let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
                 tracing::info!(
                     "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
                     req_tenant.id,
@@ -1030,8 +1063,9 @@ impl Service {
     pub(crate) async fn tenant_create(
         &self,
         create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
     ) -> Result<TenantCreateResponse, ApiError> {
-        let (response, waiters) = self.do_tenant_create(create_req).await?;
+        let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
 
         self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
         Ok(response)
@@ -1040,6 +1074,7 @@ impl Service {
     pub(crate) async fn do_tenant_create(
         &self,
         create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
     ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
@@ -1065,9 +1100,27 @@ impl Service {
             })
             .collect::<Vec<_>>();
 
-        // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
-        // have no expectation of HA).
-        let placement_policy: PlacementPolicy = PlacementPolicy::Single;
+        // If the caller specifies a None generation, it means "start from default".  This is different
+        // to [`Self::tenant_location_config`], where a None generation is used to represent
+        // an incompletely-onboarded tenant.
+        let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
+            tracing::info!(
+                "tenant_create: secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            create_req.generation.map(Generation::new)
+        } else {
+            tracing::info!(
+                "tenant_create: not secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            Some(
+                create_req
+                    .generation
+                    .map(Generation::new)
+                    .unwrap_or(INITIAL_GENERATION),
+            )
+        };
 
         // Ordering: we persist tenant shards before creating them on the pageserver.  This enables a caller
         // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1079,8 +1132,10 @@ impl Service {
                 shard_number: tenant_shard_id.shard_number.0 as i32,
                 shard_count: tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
-                generation_pageserver: i64::MAX,
+                generation: initial_generation.map(|g| g.into().unwrap() as i32),
+                // The pageserver is not known until scheduling happens: we will set this column when
+                // incrementing the generation the first time we attach to a pageserver.
+                generation_pageserver: None,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
                 splitting: SplitState::default(),
@@ -1120,15 +1175,17 @@ impl Service {
                             ))
                         })?;
 
-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: entry
+                        if let Some(node_id) = entry.get().intent.get_attached() {
+                            let generation = entry
                                 .get()
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: entry.get().generation.into().unwrap(),
-                        });
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
 
                         continue;
                     }
@@ -1142,9 +1199,7 @@ impl Service {
                             placement_policy.clone(),
                         );
 
-                        if let Some(create_gen) = create_req.generation {
-                            state.generation = Generation::new(create_gen);
-                        }
+                        state.generation = initial_generation;
                         state.config = create_req.config.clone();
 
                         state.schedule(scheduler).map_err(|e| {
@@ -1153,14 +1208,18 @@ impl Service {
                             ))
                         })?;
 
-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: state
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: state.generation.into().unwrap(),
-                        });
+                        // Only include shards in result if we are attaching: the purpose
+                        // of the response is to tell the caller where the shards are attached.
+                        if let Some(node_id) = state.intent.get_attached() {
+                            let generation = state
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
                         entry.insert(state)
                     }
                 };
@@ -1214,12 +1273,114 @@ impl Service {
         Ok(())
     }
 
-    /// This API is used by the cloud control plane to do coarse-grained control of tenants:
-    /// - Call with mode Attached* to upsert the tenant.
-    /// - Call with mode Detached to switch to PolicyMode::Detached
+    /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
+    /// and transform it into either a tenant creation of a series of shard updates.
+    fn tenant_location_config_prepare(
+        &self,
+        tenant_id: TenantId,
+        req: TenantLocationConfigRequest,
+    ) -> TenantCreateOrUpdate {
+        let mut updates = Vec::new();
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+
+        // Use location config mode as an indicator of policy.
+        let placement_policy = match req.config.mode {
+            LocationConfigMode::Detached => PlacementPolicy::Detached,
+            LocationConfigMode::Secondary => PlacementPolicy::Secondary,
+            LocationConfigMode::AttachedMulti
+            | LocationConfigMode::AttachedSingle
+            | LocationConfigMode::AttachedStale => {
+                if nodes.len() > 1 {
+                    PlacementPolicy::Double(1)
+                } else {
+                    // Convenience for dev/test: if we just have one pageserver, import
+                    // tenants into Single mode so that scheduling will succeed.
+                    PlacementPolicy::Single
+                }
+            }
+        };
+
+        let mut create = true;
+        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            // Saw an existing shard: this is not a creation
+            create = false;
+
+            // Shards may have initially been created by a Secondary request, where we
+            // would have left generation as None.
+            //
+            // We only update generation the first time we see an attached-mode request,
+            // and if there is no existing generation set. The caller is responsible for
+            // ensuring that no non-storage-controller pageserver ever uses a higher
+            // generation than they passed in here.
+            use LocationConfigMode::*;
+            let set_generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
+                    req.config.generation.map(Generation::new)
+                }
+                _ => None,
+            };
+
+            if shard.policy != placement_policy
+                || shard.config != req.config.tenant_conf
+                || set_generation.is_some()
+            {
+                updates.push(ShardUpdate {
+                    tenant_shard_id: *shard_id,
+                    placement_policy: placement_policy.clone(),
+                    tenant_config: req.config.tenant_conf.clone(),
+                    generation: set_generation,
+                });
+            }
+        }
+
+        if create {
+            use LocationConfigMode::*;
+            let generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
+                // If a caller provided a generation in a non-attached request, ignore it
+                // and leave our generation as None: this enables a subsequent update to set
+                // the generation when setting an attached mode for the first time.
+                _ => None,
+            };
+
+            TenantCreateOrUpdate::Create(
+                // Synthesize a creation request
+                (
+                    TenantCreateRequest {
+                        new_tenant_id: TenantShardId::unsharded(tenant_id),
+                        generation,
+                        shard_parameters: ShardParameters {
+                            // Must preserve the incoming shard_count do distinguish unsharded (0)
+                            // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                            count: req.tenant_id.shard_count,
+                            // We only import un-sharded or single-sharded tenants, so stripe
+                            // size can be made up arbitrarily here.
+                            stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        },
+                        config: req.config.tenant_conf,
+                    },
+                    placement_policy,
+                ),
+            )
+        } else {
+            TenantCreateOrUpdate::Update(updates)
+        }
+    }
+
+    /// This API is used by the cloud control plane to migrate unsharded tenants that it created
+    /// directly with pageservers into this service.
     ///
-    /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
-    /// secondary locations.
+    /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
+    /// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
+    /// Think of the first attempt to call this API as a transfer of absolute authority over the
+    /// tenant's source of generation numbers.
+    ///
+    /// The mode in this request coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Secondary to either onboard a tenant without attaching it, or
+    ///   to set an existing tenant to PolicyMode::Secondary
+    /// - Call with mode Detached to switch to PolicyMode::Detached
     pub(crate) async fn tenant_location_config(
         &self,
         tenant_id: TenantId,
@@ -1231,131 +1392,96 @@ impl Service {
             )));
         }
 
-        let mut waiters = Vec::new();
+        // First check if this is a creation or an update
+        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
+
         let mut result = TenantLocationConfigResponse { shards: Vec::new() };
-        let maybe_create = {
-            let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
+        let waiters = match create_or_update {
+            TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
+                let (create_resp, waiters) =
+                    self.do_tenant_create(create_req, placement_policy).await?;
+                result.shards = create_resp
+                    .shards
+                    .into_iter()
+                    .map(|s| TenantShardLocation {
+                        node_id: s.node_id,
+                        shard_id: s.shard_id,
+                    })
+                    .collect();
+                waiters
+            }
+            TenantCreateOrUpdate::Update(updates) => {
+                // Persist updates
+                // Ordering: write to the database before applying changes in-memory, so that
+                // we will not appear time-travel backwards on a restart.
+                for ShardUpdate {
+                    tenant_shard_id,
+                    placement_policy,
+                    tenant_config,
+                    generation,
+                } in &updates
+                {
+                    self.persistence
+                        .update_tenant_shard(
+                            *tenant_shard_id,
+                            placement_policy.clone(),
+                            tenant_config.clone(),
+                            *generation,
+                        )
+                        .await?;
+                }
 
-            // Maybe we have existing shards
-            let mut create = true;
-            for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-                // Saw an existing shard: this is not a creation
-                create = false;
+                // Apply updates in-memory
+                let mut waiters = Vec::new();
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let result_tx = locked.result_tx.clone();
+                    let compute_hook = locked.compute_hook.clone();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();
 
-                // Note that for existing tenants we do _not_ respect the generation in the request: this is likely
-                // to be stale.  Once a tenant is created in this service, our view of generation is authoritative, and
-                // callers' generations may be ignored.  This represents a one-way migration of tenants from the outer
-                // cloud control plane into this service.
+                    for ShardUpdate {
+                        tenant_shard_id,
+                        placement_policy,
+                        tenant_config,
+                        generation: update_generation,
+                    } in updates
+                    {
+                        let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                            tracing::warn!("Shard {tenant_shard_id} removed while updating");
+                            continue;
+                        };
 
-                // Use location config mode as an indicator of policy: if they ask for
-                // attached we go to default HA attached mode.  If they ask for secondary
-                // we go to secondary-only mode.  If they ask for detached we detach.
-                match req.config.mode {
-                    LocationConfigMode::Detached => {
-                        shard.policy = PlacementPolicy::Detached;
-                    }
-                    LocationConfigMode::Secondary => {
-                        // TODO: implement secondary-only mode.
-                        todo!();
-                    }
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // TODO: persistence for changes in policy
-                        if nodes.len() > 1 {
-                            shard.policy = PlacementPolicy::Double(1)
-                        } else {
-                            // Convenience for dev/test: if we just have one pageserver, import
-                            // tenants into Single mode so that scheduling will succeed.
-                            shard.policy = PlacementPolicy::Single
+                        shard.policy = placement_policy;
+                        shard.config = tenant_config;
+                        if let Some(generation) = update_generation {
+                            shard.generation = Some(generation);
+                        }
+
+                        shard.schedule(scheduler)?;
+
+                        let maybe_waiter = shard.maybe_reconcile(
+                            result_tx.clone(),
+                            nodes,
+                            &compute_hook,
+                            &self.config,
+                            &self.persistence,
+                            &self.gate,
+                            &self.cancel,
+                        );
+                        if let Some(waiter) = maybe_waiter {
+                            waiters.push(waiter);
+                        }
+
+                        if let Some(node_id) = shard.intent.get_attached() {
+                            result.shards.push(TenantShardLocation {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                            })
                         }
                     }
                 }
-
-                shard.schedule(scheduler)?;
-
-                let maybe_waiter = shard.maybe_reconcile(
-                    result_tx.clone(),
-                    nodes,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                    &self.gate,
-                    &self.cancel,
-                );
-                if let Some(waiter) = maybe_waiter {
-                    waiters.push(waiter);
-                }
-
-                if let Some(node_id) = shard.intent.get_attached() {
-                    result.shards.push(TenantShardLocation {
-                        shard_id: *shard_id,
-                        node_id: *node_id,
-                    })
-                }
+                waiters
             }
-
-            if create {
-                // Validate request mode
-                match req.config.mode {
-                    LocationConfigMode::Detached | LocationConfigMode::Secondary => {
-                        // When using this API to onboard an existing tenant to this service, it must start in
-                        // an attached state, because we need the request to come with a generation
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Imported tenant must be in attached mode"
-                        )));
-                    }
-
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // Pass
-                    }
-                }
-
-                // Validate request generation
-                let Some(generation) = req.config.generation else {
-                    // We can only import attached tenants, because we need the request to come with a generation
-                    return Err(ApiError::BadRequest(anyhow::anyhow!(
-                        "Generation is mandatory when importing tenant"
-                    )));
-                };
-
-                // Synthesize a creation request
-                Some(TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: Some(generation),
-                    shard_parameters: ShardParameters {
-                        // Must preserve the incoming shard_count do distinguish unsharded (0)
-                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
-                        count: req.tenant_id.shard_count,
-                        // We only import un-sharded or single-sharded tenants, so stripe
-                        // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
-                    },
-                    config: req.config.tenant_conf,
-                })
-            } else {
-                None
-            }
-        };
-
-        let waiters = if let Some(create_req) = maybe_create {
-            let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
-            result.shards = create_resp
-                .shards
-                .into_iter()
-                .map(|s| TenantShardLocation {
-                    node_id: s.node_id,
-                    shard_id: s.shard_id,
-                })
-                .collect();
-            waiters
-        } else {
-            waiters
         };
 
         if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1375,6 +1501,91 @@ impl Service {
         Ok(result)
     }
 
+    pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
+        let tenant_id = req.tenant_id;
+        let config = req.config;
+
+        self.persistence
+            .update_tenant_config(req.tenant_id, config.clone())
+            .await?;
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let mut locked = self.inner.write().unwrap();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+                shard.config = config.clone();
+                if let Some(waiter) = shard.maybe_reconcile(
+                    result_tx.clone(),
+                    nodes,
+                    &compute_hook,
+                    &self.config,
+                    &self.persistence,
+                    &self.gate,
+                    &self.cancel,
+                ) {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Treat this as success because we have stored the configuration.  If e.g.
+            // a node was unavailable at this time, it should not stop us accepting a
+            // configuration change.
+            tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn tenant_config_get(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
+        let config = {
+            let locked = self.inner.read().unwrap();
+
+            match locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .next()
+            {
+                Some((_tenant_shard_id, shard)) => shard.config.clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant not found").into(),
+                    ))
+                }
+            }
+        };
+
+        // Unlike the pageserver, we do not have a set of global defaults: the config is
+        // entirely per-tenant.  Therefore the distinction between `tenant_specific_overrides`
+        // and `effective_config` in the response is meaningless, but we retain that syntax
+        // in order to remain compatible with the pageserver API.
+
+        let response = HashMap::from([
+            (
+                "tenant_specific_overrides",
+                serde_json::to_value(&config)
+                    .context("serializing tenant specific overrides")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+            (
+                "effective_config",
+                serde_json::to_value(&config)
+                    .context("serializing effective config")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+        ]);
+
+        Ok(response)
+    }
+
     pub(crate) async fn tenant_time_travel_remote_storage(
         &self,
         time_travel_req: &TenantTimeTravelRequest,
@@ -1460,6 +1671,60 @@ impl Service {
                         })?;
             }
         }
+        Ok(())
+    }
+
+    pub(crate) async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<(), ApiError> {
+        // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                for node_id in shard.intent.get_secondary() {
+                    let node = locked
+                        .nodes
+                        .get(node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push((*tenant_shard_id, node.clone()));
+                }
+            }
+            targets
+        };
+
+        // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
+        // downloads, they can return a clean 202 response instead of the HTTP client timing out.
+
+        // Issue concurrent requests to all shards' locations
+        let mut futs = FuturesUnordered::new();
+        for (tenant_shard_id, node) in targets {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            futs.push(async move {
+                let result = client.tenant_secondary_download(tenant_shard_id).await;
+                (result, node)
+            })
+        }
+
+        // Handle any errors returned by pageservers.  This includes cases like this request racing with
+        // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
+        // well as more general cases like 503s, 500s, or timeouts.
+        while let Some((result, node)) = futs.next().await {
+            let Err(e) = result else { continue };
+
+            // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
+            // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
+            // than they had hoped for.
+            tracing::warn!(
+                "Ignoring tenant secondary download error from pageserver {}: {e}",
+                node.id,
+            );
+        }
 
         Ok(())
     }
@@ -2039,8 +2304,8 @@ impl Service {
                     // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
                     // populate the correct generation as part of its transaction, to protect us
                     // against racing with changes in the state of the parent.
-                    generation: 0,
-                    generation_pageserver: target.node.id.0 as i64,
+                    generation: None,
+                    generation_pageserver: Some(target.node.id.0 as i64),
                     placement_policy: serde_json::to_string(&policy).unwrap(),
                     // TODO: get the config out of the map
                     config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2161,7 +2426,8 @@ impl Service {
                         .expect("It was present, we just split it");
                     let old_attached = old_state.intent.get_attached().unwrap();
                     old_state.intent.clear(scheduler);
-                    (old_attached, old_state.generation, old_state.config.clone())
+                    let generation = old_state.generation.expect("Shard must have been attached");
+                    (old_attached, generation, old_state.config.clone())
                 };
 
                 for child in child_ids {
@@ -2182,7 +2448,7 @@ impl Service {
                     child_state.observed = ObservedState {
                         locations: child_observed,
                     };
-                    child_state.generation = generation;
+                    child_state.generation = Some(generation);
                     child_state.config = config.clone();
 
                     // The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2247,6 +2513,7 @@ impl Service {
                 match shard.policy {
                     PlacementPolicy::Single => {
                         shard.intent.clear_secondary(scheduler);
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
                     }
                     PlacementPolicy::Double(_n) => {
                         // If our new attached node was a secondary, it no longer should be.
@@ -2256,6 +2523,12 @@ impl Service {
                         if let Some(old_attached) = old_attached {
                             shard.intent.push_secondary(scheduler, old_attached);
                         }
+
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+                    }
+                    PlacementPolicy::Secondary => {
+                        shard.intent.clear(scheduler);
+                        shard.intent.push_secondary(scheduler, migrate_req.node_id);
                     }
                     PlacementPolicy::Detached => {
                         return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2263,9 +2536,6 @@ impl Service {
                         )))
                     }
                 }
-                shard
-                    .intent
-                    .set_attached(scheduler, Some(migrate_req.node_id));
 
                 tracing::info!("Migrating: new intent {:?}", shard.intent);
                 shard.sequence = shard.sequence.next();
@@ -2593,7 +2863,7 @@ impl Service {
                     observed_loc.conf = None;
                 }
 
-                if tenant_state.intent.notify_offline(config_req.node_id) {
+                if tenant_state.intent.demote_attached(config_req.node_id) {
                     tenant_state.sequence = tenant_state.sequence.next();
                     match tenant_state.schedule(scheduler) {
                         Err(e) => {
@@ -2660,6 +2930,9 @@ impl Service {
     /// Helper for methods that will try and call pageserver APIs for
     /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
     /// is attached somewhere.
+    ///
+    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
+    /// an attached policy.  We should error out if it isn't.
     fn ensure_attached_schedule(
         &self,
         mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index c14fe6699e..33b7d578c7 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -53,8 +53,11 @@ pub(crate) struct TenantState {
     pub(crate) sequence: Sequence,
 
     // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: Generation,
+    // and use the incremented number when attaching.
+    //
+    // None represents an incompletely onboarded tenant via the [`Service::location_config`]
+    // API, where this tenant may only run in PlacementPolicy::Secondary.
+    pub(crate) generation: Option<Generation>,
 
     // High level description of how the tenant should be set up.  Provided
     // externally.
@@ -181,6 +184,13 @@ impl IntentState {
         }
     }
 
+    /// Remove the last secondary node from the list of secondaries
+    pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
+        if let Some(node_id) = self.secondary.pop() {
+            scheduler.node_dec_ref(node_id);
+        }
+    }
+
     pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
         if let Some(old_attached) = self.attached.take() {
             scheduler.node_dec_ref(old_attached);
@@ -208,11 +218,13 @@ impl IntentState {
         &self.secondary
     }
 
-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
+    /// If the node is in use as the attached location, demote it into
+    /// the list of secondary locations.  This is used when a node goes offline,
+    /// and we want to use a different node for attachment, but not permanently
+    /// forget the location on the offline node.
     ///
     /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+    pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
         if self.attached == Some(node_id) {
             // TODO: when scheduler starts tracking attached + secondary counts separately, we will
             // need to call into it here.
@@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult {
     pub(crate) result: Result<(), ReconcileError>,
 
     pub(crate) tenant_shard_id: TenantShardId,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
     pub(crate) observed: ObservedState,
 
     /// Set [`TenantState::pending_compute_notification`] from this flag
@@ -340,7 +352,7 @@ impl TenantState {
             tenant_shard_id,
             policy,
             intent: IntentState::default(),
-            generation: Generation::new(0),
+            generation: Some(Generation::new(0)),
             shard,
             observed: ObservedState::default(),
             config: TenantConfig::default(),
@@ -438,10 +450,16 @@ impl TenantState {
         // more work on the same pageservers we're already using.
         let mut modified = false;
 
+        // Add/remove nodes to fulfil policy
         use PlacementPolicy::*;
         match self.policy {
             Single => {
                 // Should have exactly one attached, and zero secondaries
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+
                 let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
                 modified |= modified_attached;
 
@@ -451,6 +469,23 @@ impl TenantState {
                 }
             }
             Double(secondary_count) => {
+                let retain_secondaries = if self.intent.attached.is_none()
+                    && scheduler.node_preferred(&self.intent.secondary).is_some()
+                {
+                    // If we have no attached, and one of the secondaries is elegible to be promoted, retain
+                    // one more secondary than we usually would, as one of them will become attached futher down this function.
+                    secondary_count + 1
+                } else {
+                    secondary_count
+                };
+
+                while self.intent.secondary.len() > retain_secondaries {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+
                 // Should have exactly one attached, and N secondaries
                 let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                 modified |= modified_attached;
@@ -463,15 +498,28 @@ impl TenantState {
                     modified = true;
                 }
             }
-            Detached => {
-                // Should have no attached or secondary pageservers
-                if self.intent.attached.is_some() {
-                    self.intent.set_attached(scheduler, None);
+            Secondary => {
+                if let Some(node_id) = self.intent.get_attached() {
+                    // Populate secondary by demoting the attached node
+                    self.intent.demote_attached(*node_id);
+                    modified = true;
+                } else if self.intent.secondary.is_empty() {
+                    // Populate secondary by scheduling a fresh node
+                    let node_id = scheduler.schedule_shard(&[])?;
+                    self.intent.push_secondary(scheduler, node_id);
                     modified = true;
                 }
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
+                while self.intent.secondary.len() > 1 {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Detached => {
+                // Never add locations in this mode
+                if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
+                    self.intent.clear(scheduler);
                     modified = true;
                 }
             }
@@ -518,7 +566,12 @@ impl TenantState {
 
     fn dirty(&self) -> bool {
         if let Some(node_id) = self.intent.attached {
-            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
+            // Maybe panic: it is a severe bug if we try to attach while generation is null.
+            let generation = self
+                .generation
+                .expect("Attempted to enter attached state without a generation");
+
+            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                 Some(_) | None => {
@@ -596,6 +649,10 @@ impl TenantState {
         // Reconcile already in flight for the current sequence?
         if let Some(handle) = &self.reconciler {
             if handle.sequence == self.sequence {
+                tracing::info!(
+                    "Reconciliation already in progress for sequence {:?}",
+                    self.sequence,
+                );
                 return Some(ReconcilerWaiter {
                     tenant_shard_id: self.tenant_shard_id,
                     seq_wait: self.waiter.clone(),
@@ -615,6 +672,10 @@ impl TenantState {
             return None;
         };
 
+        // Advance the sequence before spawning a reconciler, so that sequence waiters
+        // can distinguish between before+after the reconcile completes.
+        self.sequence = self.sequence.next();
+
         let reconciler_cancel = cancel.child_token();
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
@@ -716,6 +777,17 @@ impl TenantState {
         })
     }
 
+    /// Called when a ReconcileResult has been emitted and the service is updating
+    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
+    /// the handle to indicate there is no longer a reconciliation in progress.
+    pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
+        if let Some(reconcile_handle) = &self.reconciler {
+            if reconcile_handle.sequence <= sequence {
+                self.reconciler = None;
+            }
+        }
+    }
+
     // If we had any state at all referring to this node ID, drop it.  Does not
     // attempt to reschedule.
     pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -736,13 +808,8 @@ impl TenantState {
             shard_number: self.tenant_shard_id.shard_number.0 as i32,
             shard_count: self.tenant_shard_id.shard_count.literal() as i32,
             shard_stripe_size: self.shard.stripe_size.0 as i32,
-            generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: self
-                .intent
-                .get_attached()
-                .map(|n| n.0 as i64)
-                .unwrap_or(i64::MAX),
-
+            generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
+            generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
             placement_policy: serde_json::to_string(&self.policy).unwrap(),
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
@@ -805,8 +872,10 @@ pub(crate) mod tests {
         assert_ne!(attached_node_id, secondary_node_id);
 
         // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_state.intent.notify_offline(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
         assert!(changed);
+        assert!(tenant_state.intent.attached.is_none());
+        assert_eq!(tenant_state.intent.secondary.len(), 2);
 
         // Update the scheduler state to indicate the node is offline
         nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 6f6c46cfeb..af15cee924 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -45,7 +45,7 @@ impl Generation {
         Self::Broken
     }
 
-    pub fn new(v: u32) -> Self {
+    pub const fn new(v: u32) -> Self {
         Self::Valid(v)
     }
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index c8224c1c67..bc77dfd084 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -146,6 +146,8 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
 
+    env.attachment_service.consistency_check()
+
     # Set a scheduling policy on one node, create all the tenants, observe
     # that the scheduling policy is respected.
     env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -256,9 +258,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env.attachment_service.consistency_check()
 
 
-def test_sharding_service_onboarding(
-    neon_env_builder: NeonEnvBuilder,
-):
+@pytest.mark.parametrize("warm_up", [True, False])
+def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
     """
     We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
     which provides the /location_config API.  This is similar to creating a tenant,
@@ -306,6 +307,23 @@ def test_sharding_service_onboarding(
         },
     )
 
+    if warm_up:
+        origin_ps.http_client().tenant_heatmap_upload(tenant_id)
+
+        # We expect to be called via live migration code, which may try to configure the tenant into secondary
+        # mode before attaching it.
+        virtual_ps_http.tenant_location_conf(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+                "generation": None,
+            },
+        )
+
+        virtual_ps_http.tenant_secondary_download(tenant_id)
+
     # Call into attachment service to onboard the tenant
     generation += 1
     virtual_ps_http.tenant_location_conf(
@@ -351,7 +369,9 @@ def test_sharding_service_onboarding(
     assert len(dest_tenants) == 1
     assert TenantId(dest_tenants[0]["id"]) == tenant_id
 
-    # sharding service advances generation by 1 when it first attaches
+    # sharding service advances generation by 1 when it first attaches.  We started
+    # with a nonzero generation so this equality also proves that the generation
+    # was properly carried over during onboarding.
     assert dest_tenants[0]["generation"] == generation + 1
 
     # The onboarded tenant should survive a restart of sharding service
@@ -362,6 +382,31 @@ def test_sharding_service_onboarding(
     dest_ps.stop()
     dest_ps.start()
 
+    # Having onboarded via /location_config, we should also be able to update the
+    # TenantConf part of LocationConf, without inadvertently resetting the generation
+    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
+    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+
+    # The generation has moved on since we onboarded
+    assert generation != dest_tenant_before_conf_change["generation"]
+
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": modified_tenant_conf,
+            # This is intentionally a stale generation
+            "generation": generation,
+        },
+    )
+    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+    assert (
+        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
+    )
+    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
+    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
+
     env.attachment_service.consistency_check()
 
 
@@ -667,3 +712,41 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
         svc.request(
             "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
         )
+
+
+def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
+    """
+    Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
+    supplying the whole LocationConf.
+    """
+
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+
+    http = env.attachment_service.pageserver_api()
+
+    default_value = "7days"
+    new_value = "1h"
+    http.set_tenant_config(tenant_id, {"pitr_interval": new_value})
+
+    # Ensure the change landed on the storage controller
+    readback_controller = http.tenant_config(tenant_id)
+    assert readback_controller.effective_config["pitr_interval"] == new_value
+    assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value
+
+    # Ensure the change made it down to the pageserver
+    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
+    assert readback_ps.effective_config["pitr_interval"] == new_value
+    assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value
+
+    # Omitting a value clears it.  This looks different in storage controller
+    # vs. pageserver API calls, because pageserver has defaults.
+    http.set_tenant_config(tenant_id, {})
+    readback_controller = http.tenant_config(tenant_id)
+    assert readback_controller.effective_config["pitr_interval"] is None
+    assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
+    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
+    assert readback_ps.effective_config["pitr_interval"] == default_value
+    assert "pitr_interval" not in readback_ps.tenant_specific_overrides
+
+    env.attachment_service.consistency_check()