From 58332cb3615954bedb317bbbf311df47310d4a03 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 10 Jan 2025 21:35:50 +0100 Subject: [PATCH 01/32] pageserver: remove unused metric `pageserver_layers_visited_per_read_global` (#10141) As of commit "pageserver: remove legacy read path" (#8601) we always use vectored get, which has a separate metric. --- pageserver/src/metrics.rs | 10 ---------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_compaction.py | 12 +----------- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a313a64080..5b8419fda9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_layers_visited_per_read_global", - "Number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], - ) - .expect("failed to define a metric") -}); - pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_visited_per_vectored_read_global", @@ -3894,7 +3885,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &READ_NUM_LAYERS_VISITED, &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c3..fa541bad17 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ae48a8fc27..fe0422088a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -84,9 +84,6 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_read_global_sum", - "pageserver_layers_visited_per_read_global_count", - "pageserver_layers_visited_per_read_global_bucket", "pageserver_layers_visited_per_vectored_read_global_sum", "pageserver_layers_visited_per_vectored_read_global_count", "pageserver_layers_visited_per_vectored_read_global_bucket", @@ -97,12 +94,6 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") - non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - if non_vectored_count.value != 0: - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - else: - non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") if vectored_count.value > 0: @@ -113,11 +104,10 @@ page_cache_size=10 assert vectored_sum.value == 0 
vectored_average = 0 - log.info(f"{non_vectored_average=} {vectored_average=}") + log.info(f"{vectored_average=}") # The upper bound for average number of layer visits below (8) # was chosen empirically for this workload. - assert non_vectored_average < 8 assert vectored_average < 8 From b5d54ba52a92f263c087eff56ab7ced9499c1ae6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:53:00 -0500 Subject: [PATCH 02/32] refactor(pageserver): move queue logic to compaction.rs (#10330) ## Problem close https://github.com/neondatabase/neon/issues/10031, part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Move the compaction job generation to `compaction.rs`, thus making the code more readable and debuggable. We now also return running job through the get compaction job API, versus before we only return scheduled jobs. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 14 +- pageserver/src/tenant.rs | 175 +++-------- pageserver/src/tenant/timeline/compaction.rs | 293 ++++++++++++++++++- 4 files changed, 324 insertions(+), 160 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 39390d7647..9af6c4021d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -272,6 +272,8 @@ pub struct CompactInfoResponse { pub compact_key_range: Option, pub compact_lsn_range: Option, pub sub_compaction: bool, + pub running: bool, + pub job_id: usize, } #[derive(Serialize, Deserialize, Clone)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 60ef4c3702..94e0b101bd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, - TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let res = tenant.get_scheduled_compaction_tasks(timeline_id); - let mut resp = Vec::new(); - for item in res { - resp.push(CompactInfoResponse { - compact_key_range: item.compact_key_range, - compact_lsn_range: item.compact_lsn_range, - sub_compaction: item.sub_compaction, - }); - } + let resp = tenant.get_scheduled_compaction_tasks(timeline_id); json_response(StatusCode::OK, resp) } .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8e61d09de7..2928c435cb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use 
std::collections::BTreeMap; -use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; -use timeline::compaction::GcCompactJob; -use timeline::compaction::ScheduledCompactionTask; +use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; -use timeline::CompactFlags; use timeline::CompactOptions; -use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -347,10 +344,8 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, - /// Scheduled compaction tasks. Currently, this can only be populated by triggering - /// a manual gc-compaction from the manual compaction API. - scheduled_compaction_tasks: - std::sync::Mutex>>, + /// Scheduled gc-compaction tasks. + scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy @@ -2997,104 +2992,18 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let mut has_pending_scheduled_compaction_task; - let next_scheduled_compaction_task = { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { - if !tline_pending_tasks.is_empty() { - info!( - "{} tasks left in the compaction schedule queue", - tline_pending_tasks.len() - ); - } - let next_task = tline_pending_tasks.pop_front(); - has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); - next_task - } else { - has_pending_scheduled_compaction_task = false; - None - } + let queue = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(timeline_id).cloned() }; - if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task - { - if !next_scheduled_compaction_task - .options - .flags - .contains(CompactFlags::EnhancedGcBottomMostCompaction) - { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); - } else if next_scheduled_compaction_task.options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline - .gc_compaction_split_jobs( - GcCompactJob::from_compact_options( - next_scheduled_compaction_task.options.clone(), - ), - next_scheduled_compaction_task - .options - .sub_compaction_max_job_size_mb, - ) - .await - .map_err(CompactionError::Other)?; - if jobs.is_empty() { - info!("no jobs to run, skipping scheduled compaction task"); - } else { - has_pending_scheduled_compaction_task = true; - let jobs_len = jobs.len(); - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(*timeline_id).or_default(); - for (idx, job) in jobs.into_iter().enumerate() { - // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` - // until we do further refactors to allow directly call `compact_with_gc`. 
- let mut flags: EnumSet = EnumSet::default(); - flags |= CompactFlags::EnhancedGcBottomMostCompaction; - if job.dry_run { - flags |= CompactFlags::DryRun; - } - let options = CompactOptions { - flags, - sub_compaction: false, - compact_key_range: Some(job.compact_key_range.into()), - compact_lsn_range: Some(job.compact_lsn_range.into()), - sub_compaction_max_job_size_mb: None, - }; - tline_pending_tasks.push_back(if idx == jobs_len - 1 { - ScheduledCompactionTask { - options, - // The last job in the queue sends the signal and releases the gc guard - result_tx: next_scheduled_compaction_task - .result_tx - .take(), - gc_block: next_scheduled_compaction_task - .gc_block - .take(), - } - } else { - ScheduledCompactionTask { - options, - result_tx: None, - gc_block: None, - } - }); - } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); - } - } else { - let _ = timeline - .compact_with_options( - cancel, - next_scheduled_compaction_task.options, - ctx, - ) - .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) - .await?; - if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { - // TODO: we can send compaction statistics in the future - tx.send(()).ok(); - } - } + if let Some(queue) = queue { + let has_pending_tasks = queue + .iteration(cancel, ctx, &self.gc_block, timeline) + .await?; + Some(has_pending_tasks) + } else { + Some(false) } - Some(has_pending_scheduled_compaction_task) } } else { None @@ -3124,34 +3033,32 @@ impl Tenant { } /// Cancel scheduled compaction tasks - pub(crate) fn cancel_scheduled_compaction( - &self, - timeline_id: TimelineId, - ) -> Vec { + pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { - let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); - current_tline_pending_tasks.into_iter().collect() - } else { - Vec::new() + if let Some(q) = guard.get_mut(&timeline_id) { + q.cancel_scheduled(); } } pub(crate) fn get_scheduled_compaction_tasks( &self, timeline_id: TimelineId, - ) -> Vec { - use itertools::Itertools; - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard - .get(&timeline_id) - .map(|tline_pending_tasks| { - tline_pending_tasks - .iter() - .map(|x| x.options.clone()) - .collect_vec() - }) - .unwrap_or_default() + ) -> Vec { + let res = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(&timeline_id).map(|q| q.remaining_jobs()) + }; + let Some((running, remaining)) = res else { + return Vec::new(); + }; + let mut result = Vec::new(); + if let Some((id, running)) = running { + result.extend(running.into_compact_info_resp(id, true)); + } + for (id, job) in remaining { + result.extend(job.into_compact_info_resp(id, false)); + } + result } /// Schedule a compaction task for a timeline. 
@@ -3160,20 +3067,12 @@ impl Tenant { timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { - let gc_guard = match self.gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - bail!("cannot run gc-compaction because gc is blocked: {}", e); - } - }; let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(timeline_id).or_default(); - tline_pending_tasks.push_back(ScheduledCompactionTask { - options, - result_tx: Some(tx), - gc_block: Some(gc_guard), - }); + let q = guard + .entry(timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())); + q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 55cde8603e..05f8d476f9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -16,10 +16,12 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; +use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; @@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; +use crate::tenant::gc_block::GcBlock; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -63,16 +66,284 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; -/// A scheduled compaction task. -pub(crate) struct ScheduledCompactionTask { - /// It's unfortunate that we need to store a compact options struct here because the only outer - /// API we can call here is `compact_with_options` which does a few setup calls before starting the - /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future. - pub options: CompactOptions, - /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. - pub result_tx: Option>, - /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard. 
- pub gc_block: Option, +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct GcCompactionJobId(pub usize); + +impl std::fmt::Display for GcCompactionJobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +#[derive(Debug, Clone)] +pub enum GcCompactionQueueItem { + Manual(CompactOptions), + SubCompactionJob(CompactOptions), + #[allow(dead_code)] + UpdateL2Lsn(Lsn), + Notify(GcCompactionJobId), +} + +impl GcCompactionQueueItem { + pub fn into_compact_info_resp( + self, + id: GcCompactionJobId, + running: bool, + ) -> Option { + match self { + GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::UpdateL2Lsn(_) => None, + GcCompactionQueueItem::Notify(_) => None, + } + } +} + +struct GcCompactionQueueInner { + running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, + queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + notify: HashMap>, + gc_guards: HashMap, + last_id: GcCompactionJobId, +} + +impl GcCompactionQueueInner { + fn next_id(&mut self) -> GcCompactionJobId { + let id = self.last_id; + self.last_id = GcCompactionJobId(id.0 + 1); + id + } +} + +/// A structure to store gc_compaction jobs. +pub struct GcCompactionQueue { + /// All items in the queue, and the currently-running job. + inner: std::sync::Mutex, + /// Ensure only one thread is consuming the queue. + consumer_lock: tokio::sync::Mutex<()>, +} + +impl GcCompactionQueue { + pub fn new() -> Self { + GcCompactionQueue { + inner: std::sync::Mutex::new(GcCompactionQueueInner { + running: None, + queued: VecDeque::new(), + notify: HashMap::new(), + gc_guards: HashMap::new(), + last_id: GcCompactionJobId(0), + }), + consumer_lock: tokio::sync::Mutex::new(()), + } + } + + pub fn cancel_scheduled(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.queued.clear(); + guard.notify.clear(); + guard.gc_guards.clear(); + } + + /// Schedule a manual compaction job. + pub fn schedule_manual_compaction( + &self, + options: CompactOptions, + notify: Option>, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard + .queued + .push_back((id, GcCompactionQueueItem::Manual(options))); + if let Some(notify) = notify { + guard.notify.insert(id, notify); + } + info!("scheduled compaction job id={}", id); + id + } + + /// Trigger an auto compaction. + #[allow(dead_code)] + pub fn trigger_auto_compaction(&self, _: &Arc) {} + + /// Notify the caller the job has finished and unblock GC. 
+ fn notify_and_unblock(&self, id: GcCompactionJobId) { + info!("compaction job id={} finished", id); + let mut guard = self.inner.lock().unwrap(); + if let Some(blocking) = guard.gc_guards.remove(&id) { + drop(blocking) + } + if let Some(tx) = guard.notify.remove(&id) { + let _ = tx.send(()); + } + } + + async fn handle_sub_compaction( + &self, + id: GcCompactionJobId, + options: CompactOptions, + timeline: &Arc, + gc_block: &GcBlock, + ) -> Result<(), CompactionError> { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs: Vec = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options(options.clone()), + options.sub_compaction_max_job_size_mb, + ) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + self.notify_and_unblock(id); + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + + let jobs_len = jobs.len(); + let mut pending_tasks = Vec::new(); + for job in jobs { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. + let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + } + pending_tasks.push(GcCompactionQueueItem::Notify(id)); + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + let mut tasks = Vec::new(); + for task in pending_tasks { + let id = guard.next_id(); + tasks.push((id, task)); + } + tasks.reverse(); + for item in tasks { + guard.queued.push_front(item); + } + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } + Ok(()) + } + + /// Take a job from the queue and process it. Returns if there are still pending tasks. 
+ pub async fn iteration( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, + ) -> Result { + let _one_op_at_a_time_guard = self.consumer_lock.lock().await; + let has_pending_tasks; + let (id, item) = { + let mut guard = self.inner.lock().unwrap(); + let Some((id, item)) = guard.queued.pop_front() else { + return Ok(false); + }; + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + (id, item) + }; + + match item { + GcCompactionQueueItem::Manual(options) => { + if !options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + } else if options.sub_compaction { + self.handle_sub_compaction(id, options, timeline, gc_block) + .await?; + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + } + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + self.notify_and_unblock(id); + } + } + GcCompactionQueueItem::SubCompactionJob(options) => { + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + } + GcCompactionQueueItem::Notify(id) => { + self.notify_and_unblock(id); + } + GcCompactionQueueItem::UpdateL2Lsn(_) => { + unreachable!() + } + } + { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + Ok(has_pending_tasks) + } + + #[allow(clippy::type_complexity)] + pub fn remaining_jobs( + &self, + ) -> ( + Option<(GcCompactionJobId, GcCompactionQueueItem)>, + VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + ) { + let guard = self.inner.lock().unwrap(); + (guard.running.clone(), guard.queued.clone()) + } + + #[allow(dead_code)] + pub fn remaining_jobs_num(&self) -> usize { + let guard = self.inner.lock().unwrap(); + guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } + } } /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will From 23c0748cdd3d07c67f805d29281b1e0aea967a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 11 Jan 2025 03:52:45 +0100 Subject: [PATCH 03/32] Remove active column (#10335) We don't need or want the `active` column. Remove it. Vlad pointed out that this is safe. Thanks to the separation of the schemata in earlier PRs, this is easy. 
follow-up of #10205 Part of https://github.com/neondatabase/neon/issues/9981 --- .../2025-01-09-160454_safekeepers_remove_active/down.sql | 4 ++++ .../2025-01-09-160454_safekeepers_remove_active/up.sql | 1 + storage_controller/src/persistence.rs | 7 ++----- storage_controller/src/schema.rs | 1 - 4 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql create mode 100644 storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql new file mode 100644 index 0000000000..c2624f858b --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql @@ -0,0 +1,4 @@ +-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table. +-- But preserving order is not a trivial operation. +-- https://wiki.postgresql.org/wiki/Alter_column_position +ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql new file mode 100644 index 0000000000..d76f044eda --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP active; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cebf3e9594..beb014f0a8 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1258,7 +1258,6 @@ pub(crate) struct SafekeeperPersistence { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, pub(crate) scheduling_policy: String, @@ -1270,7 +1269,6 @@ impl SafekeeperPersistence { SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) })?; - // omit the `active` flag on purpose: it is deprecated. Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1295,7 +1293,8 @@ pub(crate) struct SafekeeperUpsert { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, + /// The active flag will not be stored in the database and will be ignored. + pub(crate) active: Option, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, } @@ -1311,7 +1310,6 @@ impl SafekeeperUpsert { version: self.version, host: &self.host, port: self.port, - active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. @@ -1328,7 +1326,6 @@ struct InsertUpdateSafekeeper<'a> { version: i64, host: &'a str, port: i32, - active: bool, http_port: i32, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 44c91619ab..14c30c296d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -36,7 +36,6 @@ diesel::table! 
{ version -> Int8, host -> Text, port -> Int4, - active -> Bool, http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, From 70a3bf37a0d2df87ba4bab0f5d73466ecbabfb90 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 15:09:55 +0200 Subject: [PATCH 04/32] Stop building 'compute-tools' image (#10333) It's been unused from time immemorial. --------- Co-authored-by: Matthias van de Meent --- .github/workflows/build_and_test.yml | 47 ++-------------------------- compute/compute-node.Dockerfile | 11 ------- compute_tools/src/bin/fast_import.rs | 2 +- docs/docker.md | 6 +--- 4 files changed, 5 insertions(+), 61 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01f5c3ede9..cd95a5b16d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -728,30 +728,6 @@ jobs: tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - - name: Build compute-tools image - # compute-tools are Postgres independent, so build it only once - # We pick 16, because that builds on debian 11 with older glibc (and is - # thus compatible with newer glibc), rather than 17 on Debian 12, as - # that isn't guaranteed to be compatible with Debian 11 - if: matrix.version.pg == 'v16' - uses: docker/build-push-action@v6 - with: - target: compute-tools-image - context: . - build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} - DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false - push: true - pull: true - file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} - tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - compute-node-image: needs: [ compute-node-image-arch, tag ] permissions: @@ -794,14 +770,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Create multi-arch compute-tools image - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -817,12 +785,6 @@ jobs: docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - - name: Push multi-arch compute-tools image to ECR 
- if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -1001,9 +963,6 @@ jobs: docker buildx imagetools create -t $repo/neon:latest \ $repo/neon:${{ needs.tag.outputs.build-tag }} - docker buildx imagetools create -t $repo/compute-tools:latest \ - $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} @@ -1032,7 +991,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do + for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -1044,7 +1003,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -1056,7 +1015,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 303daec240..255dafa401 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1288,17 +1288,6 @@ USER nonroot COPY --chown=nonroot . . 
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy -######################################################################################### -# -# Final compute-tools image -# -######################################################################################### - -FROM debian:$DEBIAN_FLAVOR AS compute-tools-image - -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - ######################################################################################### # # Layer "pgbouncer" diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 793ec4cf10..f554362751 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -17,7 +17,7 @@ //! //! # Local Testing //! -//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! ```bash diff --git a/docs/docker.md b/docs/docker.md index 0914a00082..ae74c2b2ab 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,15 +7,11 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). -And additional intermediate image: - -- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. - ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) +1. `neondatabase/compute-node-v17` (and -16, -v15, -v14) 2. `neondatabase/neon` From 846e8fdce4a9e6a06941c865575350343007c2a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 16:20:50 +0200 Subject: [PATCH 05/32] Remove obsolete hnsw extension (#8008) This has been deprecated and disabled for new installations for a long time. Let's remove it for good. 
--- compute/compute-node.Dockerfile | 14 - pgxn/hnsw/Makefile | 26 -- pgxn/hnsw/README.md | 25 -- pgxn/hnsw/hnsw--0.1.0.sql | 29 -- pgxn/hnsw/hnsw.c | 590 -------------------------------- pgxn/hnsw/hnsw.control | 4 - pgxn/hnsw/hnsw.h | 15 - pgxn/hnsw/hnswalg.cpp | 379 -------------------- pgxn/hnsw/hnswalg.h | 69 ---- pgxn/hnsw/test/expected/knn.out | 28 -- pgxn/hnsw/test/sql/knn.sql | 13 - 11 files changed, 1192 deletions(-) delete mode 100644 pgxn/hnsw/Makefile delete mode 100644 pgxn/hnsw/README.md delete mode 100644 pgxn/hnsw/hnsw--0.1.0.sql delete mode 100644 pgxn/hnsw/hnsw.c delete mode 100644 pgxn/hnsw/hnsw.control delete mode 100644 pgxn/hnsw/hnsw.h delete mode 100644 pgxn/hnsw/hnswalg.cpp delete mode 100644 pgxn/hnsw/hnswalg.h delete mode 100644 pgxn/hnsw/test/expected/knn.out delete mode 100644 pgxn/hnsw/test/sql/knn.sql diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 255dafa401..f4df507b74 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1258,20 +1258,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ - -s install && \ - case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16" | "v17") \ - echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/hnsw \ -s install ######################################################################################### diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile deleted file mode 100644 index 66436b5920..0000000000 --- a/pgxn/hnsw/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -EXTENSION = hnsw -EXTVERSION = 0.1.0 - -MODULE_big = hnsw -DATA = $(wildcard *--*.sql) -OBJS = hnsw.o hnswalg.o - -TESTS = $(wildcard test/sql/*.sql) -REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=hnsw - -# For auto-vectorization: -# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html -PG_CFLAGS += -O3 -PG_CXXFLAGS += -O3 -std=c++11 -PG_LDFLAGS += -lstdc++ - -all: $(EXTENSION)--$(EXTVERSION).sql - -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) - -dist: - mkdir -p dist - git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md deleted file mode 100644 index bc9c8d571c..0000000000 --- a/pgxn/hnsw/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors - -This ANN extension of Postgres is based -on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), -the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: - -[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), -
-Dmitry Baranchuk, Artem Babenko, Yury Malkov - -# Postgres extension - -HNSW index is hold in memory (built on demand) and it's maxial size is limited -by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type). -Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters -described in the article). - -# Example of usage: - -``` -create extension hnsw; -create table embeddings(id integer primary key, payload real[]); -create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); -select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; -``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql deleted file mode 100644 index ebf424326d..0000000000 --- a/pgxn/hnsw/hnsw--0.1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION hnsw" to load this file. \quit - --- functions - -CREATE FUNCTION l2_distance(real[], real[]) RETURNS real - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - --- access method - -CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; - -COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; - --- opclasses - -CREATE OPERATOR CLASS knn_ops - DEFAULT FOR TYPE real[] USING hnsw AS - OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c deleted file mode 100644 index e624cb831f..0000000000 --- a/pgxn/hnsw/hnsw.c +++ /dev/null @@ -1,590 +0,0 @@ -#include "postgres.h" - -#include "access/amapi.h" -#include "access/generic_xlog.h" -#include "access/relation.h" -#include "access/reloptions.h" -#include "access/tableam.h" -#include "catalog/index.h" -#include "commands/vacuum.h" -#include "nodes/execnodes.h" -#include "storage/bufmgr.h" -#include "utils/guc.h" -#include "utils/selfuncs.h" - -#include -#include - -#include "hnsw.h" - -PG_MODULE_MAGIC; - -typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int dims; - int maxelements; - int efConstruction; - int efSearch; - int M; -} HnswOptions; - -static relopt_kind hnsw_relopt_kind; - -typedef struct { - HierarchicalNSW* hnsw; - size_t curr; - size_t n_results; - ItemPointer results; -} HnswScanOpaqueData; - -typedef HnswScanOpaqueData* HnswScanOpaque; - -typedef struct { - Oid relid; - uint32 status; - HierarchicalNSW* hnsw; -} HnswHashEntry; - - -#define SH_PREFIX hnsw_index -#define SH_ELEMENT_TYPE HnswHashEntry -#define SH_KEY_TYPE Oid -#define SH_KEY relid -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->relid) -#define SH_HASH_KEY(tb, key) (key) -#define SH_EQUAL(tb, a, b) ((a) == (b)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -#define INDEX_HASH_SIZE 11 - -#define DEFAULT_EF_SEARCH 64 - -PGDLLEXPORT void _PG_init(void); - -static hnsw_index_hash *hnsw_indexes; - -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ - hnsw_relopt_kind = add_reloption_kind(); - add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", - 100, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", - 16, 1, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", - 64, 1, INT_MAX, AccessExclusiveLock); - hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); -} - - -static void -hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - HierarchicalNSW* hnsw = (HierarchicalNSW*) state; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return; - - array = DatumGetArrayTypeP(values[0]); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - - memcpy(&label, tid, sizeof(*tid)); - hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); -} - -static void -hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) -{ - IndexInfo* indexInfo = BuildIndexInfo(indexRel); - Assert(indexInfo->ii_NumIndexAttrs == 1); - table_index_build_scan(heapRel, indexRel, indexInfo, - true, true, hnsw_build_callback, (void *) hnsw, NULL); -} - -#ifdef __APPLE__ - -#include -#include - -static void -hnsw_check_available_memory(Size requested) -{ - size_t total; - if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#else - -#include - -static void -hnsw_check_available_memory(Size requested) -{ - struct sysinfo si; - Size total; - if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - total = si.totalram*si.mem_unit; - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#endif - -static 
HierarchicalNSW* -hnsw_get_index(Relation indexRel, Relation heapRel) -{ - HierarchicalNSW* hnsw; - Oid indexoid = RelationGetRelid(indexRel); - HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); - if (entry == NULL) - { - size_t dims, maxelements; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t size_data_per_element; - size_t data_size; - dsm_handle handle = indexoid << 1; /* make it even */ - void* impl_private = NULL; - void* mapped_address = NULL; - Size mapped_size = 0; - Size shmem_size; - bool exists = true; - bool found; - HnswOptions *opts = (HnswOptions *) indexRel->rd_options; - if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { - elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); - } - dims = opts->dims; - maxelements = opts->maxelements; - M = opts->M; - maxM = M * 2; - data_size = dims * sizeof(coord_t); - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; - - hnsw_check_available_memory(shmem_size); - - /* first try to attach to existed index */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* index doesn't exists: try to create it */ - if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* We can do it under shared lock, so some other backend may - * try to initialize index. If create is failed because index already - * created by somebody else, then try to attach to it once again - */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, ERROR)) - { - return NULL; - } - } - else - { - exists = false; - } - } - Assert(mapped_size == shmem_size); - hnsw = (HierarchicalNSW*)mapped_address; - - if (!exists) - { - hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); - hnsw_populate(hnsw, indexRel, heapRel); - } - entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); - Assert(!found); - entry->hnsw = hnsw; - } - else - { - hnsw = entry->hnsw; - } - return hnsw; -} - -/* - * Start or restart an index scan - */ -static IndexScanDesc -hnsw_beginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); - HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); - Relation heap = relation_open(index->rd_index->indrelid, NoLock); - so->hnsw = hnsw_get_index(index, heap); - relation_close(heap, NoLock); - so->curr = 0; - so->n_results = 0; - so->results = NULL; - scan->opaque = so; - return scan; -} - -/* - * Start or restart an index scan - */ -static void -hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - { - pfree(so->results); - so->results = NULL; - } - so->curr = 0; - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -static bool -hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->curr == 0) - { - Datum value; - ArrayType* array; - int n_items; - 
size_t n_results; - label_t* results; - HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; - size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH; - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan HNSW index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(so->hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(so->hnsw)); - } - - if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) - elog(ERROR, "HNSW index search failed"); - so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); - so->n_results = n_results; - for (size_t i = 0; i < n_results; i++) - { - memcpy(&so->results[i], &results[i], sizeof(so->results[i])); - } - free(results); - } - if (so->curr >= so->n_results) - { - return false; - } - else - { - scan->xs_heaptid = so->results[so->curr++]; - scan->xs_recheckorderby = false; - return true; - } -} - -/* - * End a scan and release resources - */ -static void -hnsw_endscan(IndexScanDesc scan) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - pfree(so->results); - pfree(so); - scan->opaque = NULL; -} - - -/* - * Estimate the cost of an index scan - */ -static void -hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation - ,double *indexPages -) -{ - GenericCosts costs; - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - MemSet(&costs, 0, sizeof(costs)); - - genericcostestimate(root, path, loop_count, &costs); - - /* Startup cost and total cost are same */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -hnsw_options(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, - {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, - {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, - {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, - {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} - }; - - return (bytea *) build_reloptions(reloptions, validate, - hnsw_relopt_kind, - sizeof(HnswOptions), - tab, lengthof(tab)); -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -hnsw_validate(Oid opclassoid) -{ - return true; -} - -/* - * Build the index for a logged table - */ -static IndexBuildResult * -hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = result->index_tuples = hnsw_count(hnsw); - - return result; -} - -/* - * Insert a tuple into the index - */ -static bool -hnsw_insert(Relation index, 
Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - Datum value; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* Detoast value */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - memcpy(&label, heap_tid, sizeof(*heap_tid)); - if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) - elog(ERROR, "HNSW index insert failed"); - return true; -} - -/* - * Build the index for an unlogged table - */ -static void -hnsw_buildempty(Relation index) -{ - /* index will be constructed on dema nd when accessed */ -} - -/* - * Clean up after a VACUUM operation - */ -static IndexBulkDeleteResult * -hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} - -/* - * Bulk delete tuples from the index - */ -static IndexBulkDeleteResult * -hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - return stats; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); -Datum -hnsw_handler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 0; - amroutine->amoptsprocnum = 0; - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; - amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel = false; - amroutine->amcaninclude = false; - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = hnsw_build; - amroutine->ambuildempty = hnsw_buildempty; - amroutine->aminsert = hnsw_insert; - amroutine->ambulkdelete = hnsw_bulkdelete; - amroutine->amvacuumcleanup = hnsw_vacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = hnsw_costestimate; - amroutine->amoptions = hnsw_options; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ - amroutine->ambuildphasename = NULL; - amroutine->amvalidate = hnsw_validate; - amroutine->amadjustmembers = NULL; - amroutine->ambeginscan = hnsw_beginscan; - amroutine->amrescan = hnsw_rescan; - amroutine->amgettuple = hnsw_gettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = hnsw_endscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - 
amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} - -/* - * Get the L2 distance between vectors - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); -Datum -l2_distance(PG_FUNCTION_ARGS) -{ - ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); - int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); - int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); - dist_t distance = 0.0; - dist_t diff; - coord_t *ax = (coord_t*)ARR_DATA_PTR(a); - coord_t *bx = (coord_t*)ARR_DATA_PTR(b); - - if (a_dim != b_dim) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different array dimensions %d and %d", a_dim, b_dim))); - } - - for (int i = 0; i < a_dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } - - PG_RETURN_FLOAT4((dist_t)sqrt(distance)); -} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control deleted file mode 100644 index fbfa1a5b47..0000000000 --- a/pgxn/hnsw/hnsw.control +++ /dev/null @@ -1,4 +0,0 @@ -comment = '** Deprecated ** Please use pg_embedding instead' -default_version = '0.1.0' -module_pathname = '$libdir/hnsw' -relocatable = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h deleted file mode 100644 index d4065ab8fe..0000000000 --- a/pgxn/hnsw/hnsw.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -typedef float coord_t; -typedef float dist_t; -typedef uint32_t idx_t; -typedef uint64_t label_t; - -typedef struct HierarchicalNSW HierarchicalNSW; - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); -void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); -int hnsw_dimensions(HierarchicalNSW* hnsw); -size_t hnsw_count(HierarchicalNSW* hnsw); -size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp deleted file mode 100644 index f6de3b8314..0000000000 --- a/pgxn/hnsw/hnswalg.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include "hnswalg.h" - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#define PREFETCH(addr,hint) -#endif - -HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) -{ - dim = dim_; - data_size = dim * sizeof(coord_t); - - efConstruction = efConstruction_; - - maxelements = maxelements_; - M = M_; - maxM = maxM_; - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - offset_data = size_links_level0; - offset_label = offset_data + data_size; - - enterpoint_node = 0; - cur_element_count = 0; -#ifdef __x86_64__ - use_avx2 = __builtin_cpu_supports("avx2"); -#endif -} - -std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) -{ - std::vector visited; - visited.resize((cur_element_count + 31) >> 5); - - std::priority_queue> topResults; - std::priority_queue> candidateSet; - - dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); - - topResults.emplace(dist, enterpoint_node); - candidateSet.emplace(-dist, enterpoint_node); - visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); - dist_t lowerBound = dist; - - while (!candidateSet.empty()) - 
{ - std::pair curr_el_pair = candidateSet.top(); - if (-curr_el_pair.first > lowerBound) - break; - - candidateSet.pop(); - idx_t curNodeNum = curr_el_pair.second; - - idx_t* data = get_linklist0(curNodeNum); - size_t size = *data++; - - PREFETCH(getDataByInternalId(*data), 0); - - for (size_t j = 0; j < size; ++j) { - size_t tnum = *(data + j); - - PREFETCH(getDataByInternalId(*(data + j + 1)), 0); - - if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { - visited[tnum >> 5] |= 1 << (tnum & 31); - - dist = fstdistfunc(point, getDataByInternalId(tnum)); - - if (topResults.top().first > dist || topResults.size() < ef) { - candidateSet.emplace(-dist, tnum); - - PREFETCH(get_linklist0(candidateSet.top().second), 0); - topResults.emplace(dist, tnum); - - if (topResults.size() > ef) - topResults.pop(); - - lowerBound = topResults.top().first; - } - } - } - } - return topResults; -} - - -void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) -{ - if (topResults.size() < NN) - return; - - std::priority_queue> resultSet; - std::vector> returnlist; - - while (topResults.size() > 0) { - resultSet.emplace(-topResults.top().first, topResults.top().second); - topResults.pop(); - } - - while (resultSet.size()) { - if (returnlist.size() >= NN) - break; - std::pair curen = resultSet.top(); - dist_t dist_to_query = -curen.first; - resultSet.pop(); - bool good = true; - for (std::pair curen2 : returnlist) { - dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), - getDataByInternalId(curen.second)); - if (curdist < dist_to_query) { - good = false; - break; - } - } - if (good) returnlist.push_back(curen); - } - for (std::pair elem : returnlist) - topResults.emplace(-elem.first, elem.second); -} - -void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, - std::priority_queue> topResults) -{ - getNeighborsByHeuristic(topResults, M); - - std::vector res; - res.reserve(M); - while (topResults.size() > 0) { - res.push_back(topResults.top().second); - topResults.pop(); - } - { - idx_t* data = get_linklist0(cur_c); - if (*data) - throw std::runtime_error("Should be blank"); - - *data++ = res.size(); - - for (size_t idx = 0; idx < res.size(); idx++) { - if (data[idx]) - throw std::runtime_error("Should be blank"); - data[idx] = res[idx]; - } - } - for (size_t idx = 0; idx < res.size(); idx++) { - if (res[idx] == cur_c) - throw std::runtime_error("Connection to the same element"); - - size_t resMmax = maxM; - idx_t *ll_other = get_linklist0(res[idx]); - idx_t sz_link_list_other = *ll_other; - - if (sz_link_list_other > resMmax || sz_link_list_other < 0) - throw std::runtime_error("Bad sz_link_list_other"); - - if (sz_link_list_other < resMmax) { - idx_t *data = ll_other + 1; - data[sz_link_list_other] = cur_c; - *ll_other = sz_link_list_other + 1; - } else { - // finding the "weakest" element to replace it with the new one - idx_t *data = ll_other + 1; - dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); - // Heuristic: - std::priority_queue> candidates; - candidates.emplace(d_max, cur_c); - - for (size_t j = 0; j < sz_link_list_other; j++) - candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); - - getNeighborsByHeuristic(candidates, resMmax); - - size_t indx = 0; - while (!candidates.empty()) { - data[indx] = candidates.top().second; - candidates.pop(); - indx++; - } - *ll_other = indx; - } - } -} - -void HierarchicalNSW::addPoint(const coord_t *point, label_t 
label) -{ - if (cur_element_count >= maxelements) { - throw std::runtime_error("The number of elements exceeds the specified limit"); - } - idx_t cur_c = cur_element_count++; - memset((char *) get_linklist0(cur_c), 0, size_data_per_element); - memcpy(getDataByInternalId(cur_c), point, data_size); - memcpy(getExternalLabel(cur_c), &label, sizeof label); - - // Do nothing for the first element - if (cur_c != 0) { - std::priority_queue > topResults = searchBaseLayer(point, efConstruction); - mutuallyConnectNewElement(point, cur_c, topResults); - } -}; - -std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) -{ - std::priority_queue> topResults; - auto topCandidates = searchBaseLayer(query, k); - while (topCandidates.size() > k) { - topCandidates.pop(); - } - while (!topCandidates.empty()) { - std::pair rez = topCandidates.top(); - label_t label; - memcpy(&label, getExternalLabel(rez.second), sizeof(label)); - topResults.push(std::pair(rez.first, label)); - topCandidates.pop(); - } - - return topResults; -}; - -dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) -{ - dist_t distance = 0.0; - - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; - -} - -#ifdef __x86_64__ -#include - -__attribute__((target("avx2"))) -dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m256) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - return (res); -} - -dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m128) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - return res; -} -#endif - -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) -{ -#ifndef __x86_64__ - return fstdistfunc_scalar(x, y, dim); -#else - if(use_avx2) - return fstdistfunc_avx2(x, y, dim); - - return fstdistfunc_sse(x, y, dim); -#endif -} - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) -{ - 
try - { - auto result = hnsw->searchKnn(point, efSearch); - size_t nResults = result.size(); - *results = (label_t*)malloc(nResults*sizeof(label_t)); - for (size_t i = nResults; i-- != 0;) - { - (*results)[i] = result.top().second; - result.pop(); - } - *n_results = nResults; - return true; - } - catch (std::exception& x) - { - return false; - } -} - -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) -{ - try - { - hnsw->addPoint(point, label); - return true; - } - catch (std::exception& x) - { - fprintf(stderr, "Catch %s\n", x.what()); - return false; - } -} - -void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) -{ - new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); -} - - -int hnsw_dimensions(HierarchicalNSW* hnsw) -{ - return (int)hnsw->dim; -} - -size_t hnsw_count(HierarchicalNSW* hnsw) -{ - return hnsw->cur_element_count; -} - -size_t hnsw_sizeof(void) -{ - return sizeof(HierarchicalNSW); -} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h deleted file mode 100644 index f38aeac362..0000000000 --- a/pgxn/hnsw/hnswalg.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include "hnsw.h" -} - -struct HierarchicalNSW -{ - size_t maxelements; - size_t cur_element_count; - - idx_t enterpoint_node; - - size_t dim; - size_t data_size; - size_t offset_data; - size_t offset_label; - size_t size_data_per_element; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t efConstruction; - -#ifdef __x86_64__ - bool use_avx2; -#endif - - char data_level0_memory[0]; // varying size - - public: - HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); - ~HierarchicalNSW(); - - - inline coord_t *getDataByInternalId(idx_t internal_id) const { - return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; - } - - inline idx_t *get_linklist0(idx_t internal_id) const { - return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; - } - - inline label_t *getExternalLabel(idx_t internal_id) const { - return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; - } - - std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); - - void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); - - void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); - - void addPoint(const coord_t *point, label_t label); - - std::priority_queue> searchKnn(const coord_t *query_data, size_t k); - - dist_t fstdistfunc(const coord_t *x, const coord_t *y); -}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out deleted file mode 100644 index a1cee4525e..0000000000 --- a/pgxn/hnsw/test/expected/knn.out +++ /dev/null @@ -1,28 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); -INSERT INTO t (val) VALUES (array[1,2,4]); -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; - QUERY PLAN --------------------------------------------------------------------- - Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) - Order By: (val <-> '{3,3,3}'::real[]) -(2 rows) - -SELECT * FROM t ORDER BY val <-> array[3,3,3]; - val 
---------- - {1,2,3} - {1,2,4} - {1,1,1} - {0,0,0} -(4 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql deleted file mode 100644 index 0635bda4a2..0000000000 --- a/pgxn/hnsw/test/sql/knn.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); - -INSERT INTO t (val) VALUES (array[1,2,4]); - -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT COUNT(*) FROM t; - -DROP TABLE t; From 8327f68043e692c77f70d6a6dafa463636c01578 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 19:39:27 +0200 Subject: [PATCH 06/32] Minor cleanup of extension build commands (#10356) There used to be some pg version dependencies in these extensions, but now that there isn't, follow the simpler pattern used in other extensions. No change in the produced images. --- compute/compute-node.Dockerfile | 34 ++++----------------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f4df507b74..449e12af5d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -976,22 +976,9 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION -# version 0.3.3 supports v17 # last release v0.3.3 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_JSONSCHEMA_VERSION=0.3.3 \ - export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ - echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ + echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions @@ -1012,22 +999,9 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -# version 1.5.9 supports v17 # last release v1.5.9 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_GRAPHQL_VERSION=1.5.9 \ - export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ - echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ + echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ From cd982a82ecfd2d96c7251ff7203df38d0ed44539 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 10:44:59 +0100 Subject: [PATCH 07/32] pageserver,safekeeper: increase heap profiling frequency to 2 MB (#10362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently, the heap profiling frequency is every 1 MB allocated. Taking a profile stack trace takes about 1 µs, and allocating 1 MB takes about 15 µs, so the overhead is about 6.7% which is a bit high. This is a fixed cost regardless of whether heap profiles are actually accessed. ## Summary of changes Increase the heap profiling sample frequency from 1 MB to 2 MB, which reduces the overhead to about 3.3%. This seems acceptable, considering performance-sensitive code will avoid allocations as far as possible anyway. --- pageserver/src/bin/pageserver.rs | 6 ++++-- safekeeper/benches/receive_wal.rs | 5 ++--- safekeeper/src/bin/safekeeper.rs | 6 ++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b..921c6a5092 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 996c4d9b8c..19c6662e74 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -21,14 +21,13 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; -/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. -/// This mirrors the configuration in bin/safekeeper.rs. +/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e34575..bc7af02185 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,10 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 22a6460010490857c927d1218b21f81d2fb0c06d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 11:01:18 +0100 Subject: [PATCH 08/32] libs/utils: add `force` parameter for `/profile/cpu` (#10361) ## Problem It's only possible to take one CPU profile at a time. With Grafana continuous profiling, a (low-frequency) CPU profile will always be running, making it hard to take an ad hoc CPU profile at the same time. Resolves #10072. ## Summary of changes Add a `force` parameter for `/profile/cpu` which will end and return an already running CPU profile, starting a new one for the current caller. 
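
For illustration only, a forced ad hoc profile could then be fetched with a few lines of Python using the `requests` library. This is a sketch, not part of the change: the host/port, route prefix, and output filename are placeholders, and the response body is simply written out in whatever format the endpoint returns by default.

```python
import requests

# Placeholder base URL; the real host/port depend on the service's HTTP listener.
resp = requests.get(
    "http://localhost:9898/profile/cpu",
    # force=true interrupts an already running profile (e.g. a continuous one)
    params={"seconds": 5, "force": "true"},
)
resp.raise_for_status()

# Save the returned profile as-is.
with open("cpu-profile.out", "wb") as f:
    f.write(resp.content)
```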
--- libs/utils/src/http/endpoint.rs | 46 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4b4aa88d6b..a6ba685447 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -358,25 +358,41 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; - - // Only allow one profiler at a time. - static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); - let _lock = PROFILE_LOCK - .try_lock() - .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. - let report = tokio::task::spawn_blocking(move || { + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); + + let report = { + // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a + // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting + // for a lock(), to avoid races where the notify isn't currently awaited. + let _lock = loop { + match PROFILE_LOCK.try_lock() { + Ok(lock) => break lock, + Err(_) if force => PROFILE_CANCEL.notify_waiters(), + Err(_) => return Err(ApiError::Conflict("profiler already running".into())), + } + tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait + }; + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build()?; - std::thread::sleep(Duration::from_secs(seconds)); - guard.report().build() - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, + _ = PROFILE_CANCEL.notified() => {}, + }; + + guard + .report() + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))? + }; // Return the report in the requested format. match format { From de199d71e18888b059cc8846387d933b103f323e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 13 Jan 2025 10:34:36 +0000 Subject: [PATCH 09/32] chore: Address lints introduced in rust 1.85.0 beta (#10340) With a new beta build of the rust compiler, it's good to check out the new lints. Either to find false positives, or find flaws in our code. Additionally, it helps reduce the effort required to update to 1.85 in 6 weeks. 
--- control_plane/src/local_env.rs | 1 - libs/pq_proto/src/lib.rs | 2 +- libs/utils/src/generation.rs | 6 +++--- libs/utils/src/lsn.rs | 2 +- pageserver/src/tenant/config.rs | 4 ++-- pageserver/src/tenant/disk_btree.rs | 10 +++++----- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 4 ++-- .../timeline/walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 2 +- 9 files changed, 16 insertions(+), 17 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5b82acb3a5..2fe4cd5202 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -483,7 +483,6 @@ impl LocalEnv { .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94714359a3..50b2c69d24 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32); impl ProtocolVersion { pub const fn new(major: u16, minor: u16) -> Self { - Self((major as u32) << 16 | minor as u32) + Self(((major as u32) << 16) | minor as u32) } pub const fn minor(self) -> u16 { self.0 as u16 diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 5970836033..44565ee6a2 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -112,9 +112,9 @@ impl Serialize for Generation { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None - Err(serde::ser::Error::custom( - "Tried to serialize invalid generation ({self})", - )) + Err(serde::ser::Error::custom(format!( + "Tried to serialize invalid generation ({self:?})" + ))) } } } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index f188165600..c874fa30ff 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -260,7 +260,7 @@ impl FromStr for Lsn { { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; - Ok(Lsn((left_num as u64) << 32 | right_num as u64)) + Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index d54dded778..edf2e6a3aa 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,7 +11,7 @@ pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig}; +use pageserver_api::models::{self, TenantConfigPatch}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -597,7 +597,7 @@ impl From for models::TenantConfig { .map(humantime), heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), diff --git 
a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index c77342b144..bb9df020b5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -84,17 +84,17 @@ impl Value { fn to_u64(self) -> u64 { let b = &self.0; - (b[0] as u64) << 32 - | (b[1] as u64) << 24 - | (b[2] as u64) << 16 - | (b[3] as u64) << 8 + ((b[0] as u64) << 32) + | ((b[1] as u64) << 24) + | ((b[2] as u64) << 16) + | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = &self.0; assert!(b[0] == 0x80); - (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32 } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 71e53da20f..2b67f55a17 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { /// /// Layout: /// - 1 bit: `will_init` -/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` -/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos` #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct IndexEntry(u64); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d74faa1af5..3a8796add8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { + let needs_last_record_lsn_advance = match next_record_lsn { Some(lsn) if lsn > modification.get_lsn() => { modification.set_lsn(lsn).unwrap(); true diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e5b23fed51..7253af8507 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -308,7 +308,7 @@ impl WalIngest { epoch -= 1; } - Ok((epoch as u64) << 32 | xid as u64) + Ok(((epoch as u64) << 32) | xid as u64) } async fn ingest_clear_vm_bits( From 12053cf83260da4da46f4039d5bbfb4cbb8e57da Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 11:18:14 +0000 Subject: [PATCH 10/32] storage controller: improve consistency_check_api (#10363) ## Problem Limitations found while using this to investigate https://github.com/neondatabase/neon/issues/10234: - If we hit a node consistency issue, we drop out and don't check shards for consistency - The messages printed after a shard consistency issue are huge, and grafana appears to drop them. 
## Summary of changes - Defer node consistency errors until the end of the function, so that we always proceed to check shards for consistency - Print out smaller log lines that just point out the diffs between expected and persistent state --- storage_controller/src/service.rs | 59 ++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 265b2798d2..430d884548 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5256,7 +5256,8 @@ impl Service { expect_nodes.sort_by_key(|n| n.node_id); nodes.sort_by_key(|n| n.node_id); - if nodes != expect_nodes { + // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error + let node_result = if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); tracing::error!( "Nodes in memory: {}", @@ -5268,10 +5269,12 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? ); - return Err(ApiError::InternalServerError(anyhow::anyhow!( + Err(ApiError::InternalServerError(anyhow::anyhow!( "Node consistency failure" - ))); - } + ))) + } else { + Ok(()) + }; let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; persistent_shards @@ -5281,6 +5284,7 @@ impl Service { if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); + tracing::error!( "Shards in memory: {}", serde_json::to_string(&expect_shards) @@ -5291,12 +5295,57 @@ impl Service { serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + + // The total dump log lines above are useful in testing but in the field grafana will + // usually just drop them because they're so large. So we also do some explicit logging + // of just the diffs. + let persistent_shards = persistent_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + let expect_shards = expect_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + for (tenant_shard_id, persistent_tsp) in &persistent_shards { + match expect_shards.get(tenant_shard_id) { + None => { + tracing::error!( + "Shard {} found in database but not in memory", + tenant_shard_id + ); + } + Some(expect_tsp) => { + if expect_tsp != persistent_tsp { + tracing::error!( + "Shard {} is inconsistent. In memory: {}, database has: {}", + tenant_shard_id, + serde_json::to_string(expect_tsp).unwrap(), + serde_json::to_string(&persistent_tsp).unwrap() + ); + } + } + } + } + + // Having already logged any differences, log any shards that simply aren't present in the database + for (tenant_shard_id, memory_tsp) in &expect_shards { + if !persistent_shards.contains_key(tenant_shard_id) { + tracing::error!( + "Shard {} found in memory but not in database: {}", + tenant_shard_id, + serde_json::to_string(memory_tsp) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + } + return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard consistency failure" ))); } - Ok(()) + node_result } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that From 09fe3b025c42e56d78812e1ff329551e2da17720 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Jan 2025 13:35:39 +0200 Subject: [PATCH 11/32] Add a websockets tunnel and a test for the proxy's websockets support. 
(#3823) For testing the proxy's websockets support. I wrote this to test https://github.com/neondatabase/neon/issues/3822. Unfortunately, that bug can *not* be reproduced with this tunnel. The bug only appears when the client pipelines the first query with the authentication messages. The tunnel doesn't do that. --- Update (@conradludgate 2025-01-10): We have since added some websocket tests, but they manually implemented a very simplistic setup of the postgres protocol. Introducing the tunnel would make more complex testing simpler in the future. --------- Co-authored-by: Conrad Ludgate --- test_runner/regress/test_proxy_websockets.py | 55 +++++++ test_runner/websocket_tunnel.py | 154 +++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100755 test_runner/websocket_tunnel.py diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index ea01252ce4..f14317a39f 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -1,10 +1,15 @@ from __future__ import annotations +import asyncio import ssl +import asyncpg import pytest +import websocket_tunnel import websockets +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonProxy +from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): # close await websocket.send(b"X\x00\x00\x00\x04") await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + # Launch a tunnel service so that we can speak the websockets protocol to + # the proxy + tunnel_port = port_distributor.get_port() + tunnel_server = await websocket_tunnel.start_server( + "127.0.0.1", + tunnel_port, + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl_context, + ) + log.info(f"websockets tunnel listening for connections on port {tunnel_port}") + + async with tunnel_server: + + async def run_tunnel(): + try: + async with tunnel_server: + await tunnel_server.serve_forever() + except Exception as e: + log.error(f"Error in tunnel task: {e}") + + tunnel_task = asyncio.create_task(run_tunnel()) + + # Ok, the tunnel is now running. Check that we can connect to the proxy's + # websocket interface, through the tunnel + tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres" + + log.info(f"connecting to {tunnel_connstring}") + conn = await asyncpg.connect(tunnel_connstring) + res = await conn.fetchval("SELECT 123") + assert res == 123 + await conn.close() + log.info("Ran a query successfully through the tunnel") + + tunnel_server.close() + try: + await tunnel_task + except asyncio.CancelledError: + pass diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py new file mode 100755 index 0000000000..facdb19140 --- /dev/null +++ b/test_runner/websocket_tunnel.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# This program helps to test the WebSocket tunneling in proxy. 
It listens for a TCP +# connection on a port, and when you connect to it, it opens a websocket connection, +# and forwards all the traffic to the websocket connection, wrapped in WebSocket binary +# frames. +# +# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too. +# +# Usage for manual testing: +# +# ## Launch Posgres on port 3000: +# postgres -D data -p3000 +# +# ## Launch proxy with WSS enabled: +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres +# +# ## Launch the tunnel: +# +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# +# ## Now you can connect with psql: +# psql "postgresql://heikki@localhost:40433/postgres" +# + +import argparse +import asyncio +import logging +import ssl +from ssl import Purpose + +import websockets +from fixtures.log_helper import log + + +# Enable verbose logging of all the traffic +def enable_verbose_logging(): + logger = logging.getLogger("websockets") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + + +async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx): + server = await asyncio.start_server( + lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port + ) + return server + + +async def handle_tcp_to_websocket(tcp_reader, ws): + try: + while not tcp_reader.at_eof(): + data = await tcp_reader.read(1024) + + await ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_websocket_to_tcp(ws, tcp_writer): + try: + async for message in ws: + tcp_writer.write(message) + await tcp_writer.drain() + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext): + try: + log.info("Received TCP connection. Connecting to websockets proxy.") + + async with websockets.connect(ws_url, ssl=ctx) as ws: + try: + log.info("Connected to websockets proxy") + + async with asyncio.TaskGroup() as tg: + task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws)) + task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer)) + + done, pending = await asyncio.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + tcp_writer.close() + await ws.close() + + except* Exception as ex: + log.error(ex.exceptions) + except Exception as e: + log.error(e) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--tcp-listen-addr", + default="localhost", + help="TCP addr to listen on", + ) + parser.add_argument( + "--tcp-listen-port", + default="40444", + help="TCP port to listen on", + ) + + parser.add_argument( + "--ws-url", + default="wss://localhost/", + help="websocket URL to connect to. 
This determines the Host header sent to the server", + ) + parser.add_argument( + "--ws-host", + default="127.0.0.1", + help="websockets host to connect to", + ) + parser.add_argument( + "--ws-port", + type=int, + default=443, + help="websockets port to connect to", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + if args.verbose: + enable_verbose_logging() + + ctx = ssl.create_default_context(Purpose.SERVER_AUTH) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx) + print( + f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}" + ) + async with server: + await server.serve_forever() + + +if __name__ == "__main__": + asyncio.run(main()) From 0b9032065ef1e140ad54e09e81aaf00a1b809e87 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 14:14:23 +0100 Subject: [PATCH 12/32] utils: allow 60-second CPU profiles (#10367) Taking continuous profiles every 20 seconds is likely too expensive (in dollar terms). Let's try 60-second profiles. We can now interrupt running profiles via `?force=true`, so this should be fine. --- libs/utils/src/http/endpoint.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a6ba685447..ca65c39ad6 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -350,8 +350,8 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A }; let seconds = match parse_query_param(&req, "seconds")? { None => 5, - Some(seconds @ 1..=30) => seconds, - Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + Some(seconds @ 1..=60) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, From d1bc36f53688452dfacb7e38c546d49fce73c9f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 13:31:57 +0000 Subject: [PATCH 13/32] storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352) ## Problem We would sometimes fail to retry compute notifications: 1. Try and send, set compute_notify_failure if we can't 2. On next reconcile, reconcile() fails for some other reason (e.g. tried to talk to an offline node), and we fail the `result.is_ok() && must_notify` condition around the re-sending. Closes: https://github.com/neondatabase/cloud/issues/22612 ## Summary of changes - Clarify the meaning of the reconcile result: it should be Ok(()) if configuring attached location worked, even if secondary or detach locations cannot be reached. 
- Skip trying to talk to secondaries if they're offline - Even if reconcile fails and we can't send the compute notification (we can't send it because we're not sure if it's really attached), make sure we save the `compute_notify_failure` flag so that subsequent reconciler runs will try again - Add a regression test for the above --- storage_controller/src/reconciler.rs | 21 +++- storage_controller/src/tenant_shard.rs | 7 +- .../regress/test_storage_controller.py | 116 ++++++++++++++++++ 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index e0a854fff7..6f584e7267 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -696,6 +696,11 @@ impl Reconciler { /// First we apply special case handling (e.g. for live migrations), and then a /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. + /// + /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that + /// all locations for the tenant are in the expected state. When nodes that are to be detached + /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a + /// state where it still requires later reconciliation. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it self.maybe_refresh_observed().await?; @@ -784,10 +789,18 @@ impl Reconciler { tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); - changes.push((node.clone(), wanted_conf)) + // Only try and configure secondary locations on nodes that are available. This + // allows the reconciler to "succeed" while some secondaries are offline (e.g. after + // a node failure, where the failed node will have a secondary intent) + if node.is_available() { + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } else { + tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index c17989a316..e0a71b5822 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1122,10 +1122,15 @@ impl TenantShard { let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work + // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be + // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); + } else if must_notify { + // Carry this flag so that the reconciler's result will indicate that it still needs to retry + // the compute hook notification eventually. 
+ reconciler.compute_notify_failure = true; } // Update result counter diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da6d5b8622..616aee758d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_retry( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + """ + Test that when a reconciler can't do its compute hook notification, it will keep + trying until it succeeds. + + Reproducer for https://github.com/neondatabase/cloud/issues/22612 + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached": 1}') + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: dict[str, list[dict[str, int]] | str | None | int] = { + "tenant_id": str(tenant_id), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } + assert notifications[0] == expect + + # Block notifications, and fail a node + handle_params["status"] = 423 + env.pageservers[0].stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Avoid waiting for heartbeats + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later, + # and we will check that that happens + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Try reconciling again, it should try notifying again + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # The describe API should indicate that a notification is pending + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Unblock notifications: reconcile should work now + handle_params["status"] = 200 + notifications = [] + assert env.storage_controller.reconcile_all() == 1 + assert len(notifications) == 1 + assert ( + 
env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + # Reconciler should be idle now that it succeeded in its compute notification + notifications = [] + assert env.storage_controller.reconcile_all() == 0 + assert len(notifications) == 0 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, From b2d0e1a519616aea5e0d477c1f12940e44d2fee5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 13 Jan 2025 14:13:02 +0000 Subject: [PATCH 14/32] Link OpenSSL dynamically (#10302) ## Problem Statically linked OpenSSL is buggy in multithreaded environment: - https://github.com/neondatabase/cloud/issues/16155 - https://github.com/neondatabase/neon/issues/8275 ## Summary of changes - Link OpenSSL dynamically (revert OpenSSL part from https://github.com/neondatabase/neon/pull/8074) Before: ``` ldd /usr/local/v17/lib/libpq.so linux-vdso.so.1 (0x0000ffffb5ce4000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffb5c10000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffb5650000) /lib/ld-linux-aarch64.so.1 (0x0000ffffb5ca7000) ``` After: ``` ldd /usr/local/v17/lib/libpq.so linux-vdso.so.1 (0x0000ffffbf3e8000) libssl.so.3 => /lib/aarch64-linux-gnu/libssl.so.3 (0x0000ffffbf260000) libcrypto.so.3 => /lib/aarch64-linux-gnu/libcrypto.so.3 (0x0000ffffbec00000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbf1c0000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbea50000) /lib/ld-linux-aarch64.so.1 (0x0000ffffbf3ab000) ``` --- Dockerfile | 1 + Makefile | 3 --- build-tools.Dockerfile | 15 --------------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index d3659f917a..2e4f8e5546 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,6 +71,7 @@ RUN set -e \ ca-certificates \ # System postgres for use with client libraries (e.g. 
in storage controller) postgresql-15 \ + openssl \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/Makefile b/Makefile index 9cffc74508..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl ICU_PREFIX_DIR := /usr/local/icu # @@ -26,11 +25,9 @@ endif ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include PG_CONFIGURE_OPTS += --with-icu PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' endif UNAME_S := $(shell uname -s) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 79210a2e1b..cf6439d004 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=1.1.1w -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make -j "$(nproc)" && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. # From b31ed0acd1c5e20281556de66fa52d797c156826 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 15:23:42 +0100 Subject: [PATCH 15/32] utils: add ?force=true hint for CPU profiler (#10368) This makes it less annoying to try to take a CPU profile when a continuous profile is already running. 
--- libs/utils/src/http/endpoint.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index ca65c39ad6..9f38373ca0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -372,7 +372,11 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A match PROFILE_LOCK.try_lock() { Ok(lock) => break lock, Err(_) if force => PROFILE_CANCEL.notify_waiters(), - Err(_) => return Err(ApiError::Conflict("profiler already running".into())), + Err(_) => { + return Err(ApiError::Conflict( + "profiler already running (use ?force=true to cancel it)".into(), + )) + } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait }; From ceacc29609058f7ba18288ac141e7c55a4f01382 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 13 Jan 2025 16:26:11 +0200 Subject: [PATCH 16/32] Start with minimal prefetch distance to minimize prefetch overhead for exact or limited index scans (#10359) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1736526089437179 In case of queries index scan with LIMIT clause, multiple backends can concurrently send larger number of duplicated prefetch requests which are not stored in LFC and so actually do useless job. Current implementation of index prefetch starts with maximal prefetch distance (10 by default now) when there are no key bounds, so in queries with LIMIT clause like `select * from T order by pk limit 1` compute can send a lot of useless prefetch requests to page server. ## Summary of changes Always start with minimal prefetch distance even if there are not key boundaries. Related Postgres PRs: https://github.com/neondatabase/postgres/pull/552 https://github.com/neondatabase/postgres/pull/551 https://github.com/neondatabase/postgres/pull/550 https://github.com/neondatabase/postgres/pull/549 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c2f65b3201..210a0ba3af 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 +Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f262d631ad..d3141e17a7 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 +Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 97f9fde349..f63b141cfb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 +Subproject commit f63b141cfb0c813725a6b2574049565bff643018 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 7e3f3974bc..9c9e9a78a9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd +Subproject commit 9c9e9a78a93aebec2f6a2f54644442d35ffa245c diff --git a/vendor/revisions.json b/vendor/revisions.json index bff2f70931..d182b88008 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "7e3f3974bc8895938308f94d0e96879ffae638cd" + "9c9e9a78a93aebec2f6a2f54644442d35ffa245c" ], "v16": [ "16.6", - 
"97f9fde349c6de6d573f5ce96db07eca60ce6185" + "f63b141cfb0c813725a6b2574049565bff643018" ], "v15": [ "15.10", - "f262d631ad477a1819e84a183e5a7ef561830085" + "d3141e17a7155e3d07c8deba4a10c748a29ba1e6" ], "v14": [ "14.15", - "c2f65b3201591e02ce45b66731392f98d3388e73" + "210a0ba3afd8134ea910b203f274b165bd4f05d7" ] } From ef8bfacd6b1a9fa55923e0158ba096afc6090143 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 14:52:43 +0000 Subject: [PATCH 17/32] storage controller: API + CLI for migrating secondary locations (#10284) ## Problem Currently, if we want to move a secondary there isn't a neat way to do that: we just have migration API for the attached location, and it is only clean to use that if you've manually created a secondary via pageserver API in the place you're going to move it to. Secondary migration API enables: - Moving the secondary somewhere because we would like to later move the attached location there. - Move the secondary location because we just want to reclaim some disk space from its current location. ## Summary of changes - Add `/migrate_secondary` API - Add `tenant-shard-migrate-secondary` CLI - Add tests for above --- control_plane/src/storage_controller.rs | 5 +- control_plane/storcon_cli/src/main.rs | 31 ++++-- libs/pageserver_api/src/controller_api.rs | 1 - storage_controller/src/compute_hook.rs | 10 +- storage_controller/src/http.rs | 36 ++++++- storage_controller/src/service.rs | 63 ++++++++++++ .../regress/test_storage_controller.py | 96 +++++++++++++++---- 7 files changed, 210 insertions(+), 32 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 22d2420ed4..c41ff22d15 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -822,10 +822,7 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), + Some(TenantShardMigrateRequest { node_id }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 617b2cd1ba..1653b3c845 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -112,6 +112,13 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Migrate the secondary location for a tenant shard to a specific pageserver. 
+ TenantShardMigrateSecondary { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] @@ -540,10 +547,7 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { - tenant_shard_id, - node_id: node, - }; + let req = TenantShardMigrateRequest { node_id: node }; storcon_client .dispatch::( @@ -553,6 +557,20 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardMigrateSecondary { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { node_id: node }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), + Some(req), + ) + .await?; + } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( @@ -915,10 +933,7 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { - tenant_shard_id: mv.tenant_shard_id, - node_id: mv.to, - }), + Some(TenantShardMigrateRequest { node_id: mv.to }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 7eb3547183..ba0eb0e4ae 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -179,7 +179,6 @@ pub struct TenantDescribeResponseShard { /// specifies some constraints, e.g. asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, pub node_id: NodeId, } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 69db48f8d1..3884a6df46 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -124,7 +124,10 @@ impl ComputeHookTenant { if let Some(shard_idx) = shard_idx { sharded.shards.remove(shard_idx); } else { - tracing::warn!("Shard not found while handling detach") + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Shard not found while handling detach") } } ComputeHookTenant::Unsharded(_) => { @@ -761,7 +764,10 @@ impl ComputeHook { let mut state_locked = self.state.lock().unwrap(); match state_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { - tracing::warn!("Compute hook tenant not found for detach"); + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. 
+ tracing::info!("Compute hook tenant not found for detach"); } Entry::Occupied(mut e) => { let sharded = e.get().is_sharded(); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5385e4ee0b..c8df4ffe28 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -690,7 +690,8 @@ async fn handle_node_list(req: Request) -> Result, ApiError }; let state = get_state(&req); - let nodes = state.service.node_list().await?; + let mut nodes = state.service.node_list().await?; + nodes.sort_by_key(|n| n.get_id()); let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); json_response(StatusCode::OK, api_nodes) @@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_migrate_secondary( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req) + .await?, + ) +} + async fn handle_tenant_shard_cancel_reconcile( service: Arc, req: Request, @@ -1855,6 +1879,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/migrate_secondary", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate_secondary, + RequestName("control_v1_tenant_migrate_secondary"), + ) + }, + ) .put( "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", |r| { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 430d884548..8aa263f0c3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5055,6 +5055,69 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + pub(crate) async fn tenant_shard_migrate_secondary( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_secondary().len() == 1 + && shard.intent.get_secondary()[0] == migrate_req.node_id + { + tracing::info!( + "Migrating secondary to {node}: intent is unchanged {:?}", + shard.intent + ); + } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { + tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + } else { + let old_secondaries = shard.intent.get_secondary().clone(); + for secondary in old_secondaries { + shard.intent.remove_secondary(scheduler, secondary); + } + + shard.intent.push_secondary(scheduler, migrate_req.node_id); + shard.sequence = shard.sequence.next(); + tracing::info!( + "Migrating secondary to {node}: new intent {:?}", + shard.intent + ); + } + + self.maybe_reconcile_shard(shard, nodes) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + /// 'cancel' in this context means cancel any ongoing reconcile pub(crate) async fn tenant_shard_cancel_reconcile( &self, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 616aee758d..3a55e75589 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1052,7 +1052,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): that just hits the endpoints to check that they don't bitrot. """ - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() tenant_id = TenantId.generate() @@ -1077,7 +1077,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node - assert len(response.json()["nodes"]) == 2 + assert len(response.json()["nodes"]) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) @@ -1088,13 +1088,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) + # Secondary migration API: superficial check that it migrates + secondary_dest = env.pageservers[2].id + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary", + headers=env.storage_controller.headers(TokenScope.ADMIN), + json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest}, + ) + assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [ + secondary_dest + ] + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert len(env.storage_controller.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 2 # Tenant unclean drop API response = env.storage_controller.request( @@ -1812,7 +1824,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): """ output_dir = neon_env_builder.test_output_dir shard_count = 4 - env = 
neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] def storcon_cli(args): @@ -1841,7 +1859,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # List nodes node_lines = storcon_cli(["nodes"]) # Table header, footer, and one line of data - assert len(node_lines) == 5 + assert len(node_lines) == 7 assert "localhost" in node_lines[3] # Pause scheduling onto a node @@ -1859,10 +1877,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] + # Restore node, verify status changes in CLI output + env.pageservers[0].start() + + def is_online(): + assert "Offline" not in storcon_cli(["nodes"]) + + wait_until(is_online) + + # Let everything stabilize after node failure to avoid interfering with subsequent steps + env.storage_controller.reconcile_until_idle(timeout_secs=10) + # List tenants tenant_lines = storcon_cli(["tenants"]) assert len(tenant_lines) == 5 - assert str(env.initial_tenant) in tenant_lines[3] + assert str(tenant_id) in tenant_lines[3] # Setting scheduling policies intentionally result in warnings, they're for rare use. env.storage_controller.allowed_errors.extend( @@ -1870,23 +1899,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): ) # Describe a tenant - tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)]) assert len(tenant_lines) >= 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[0] + assert str(tenant_id) in tenant_lines[0] + + # Migrate an attached location + def other_ps_id(current_ps_id): + return ( + env.pageservers[0].id + if current_ps_id == env.pageservers[1].id + else env.pageservers[1].id + ) + + storcon_cli( + [ + "tenant-shard-migrate", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] + ) + ), + ] + ) + + # Migrate a secondary location + storcon_cli( + [ + "tenant-shard-migrate-secondary", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + ) + ), + ] + ) # Pause changes on a tenant - storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] # Cancel ongoing reconcile on a tenant - storcon_cli( - ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] - ) + storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"]) # Change a tenant's placement - storcon_cli( - ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] - ) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"]) assert "Secondary" in storcon_cli(["tenants"])[3] # Modify a tenant's config @@ -1894,7 +1958,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): [ "patch-tenant-config", 
"--tenant-id", - str(env.initial_tenant), + str(tenant_id), "--config", json.dumps({"pitr_interval": "1m"}), ] From 96243af651f67747c9eeaf800f50777c75454143 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Jan 2025 17:01:13 +0200 Subject: [PATCH 18/32] Stop building unnecessary extension tarballs (#10355) We build "custom extensions" from a different repository nowadays. --- compute/compute-node.Dockerfile | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 449e12af5d..f346f402d4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # @@ -842,13 +837,8 @@ RUN case "${PG_VERSION}" in "v17") \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### # From a338aee132497193c3587acd267babac66758d01 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 13 Jan 2025 15:20:46 +0000 Subject: [PATCH 19/32] feat(local_proxy): use ed25519 signatures with pg_session_jwt (#10290) Generally ed25519 seems to be much preferred for cryptographic strength to P256 nowadays, and it is NIST approved finally. 
We should use it where we can as it's also faster than p256. This PR makes the re-signed JWTs between local_proxy and pg_session_jwt use ed25519. This does introduce a new dependency on ed25519, but I do recall some Neon Authorise customers asking for support for ed25519, so I am justifying this dependency addition in the context that we can then introduce support for customer ed25519 keys sources: * https://csrc.nist.gov/pubs/fips/186-5/final subsection 7 (EdDSA) * https://datatracker.ietf.org/doc/html/rfc8037#section-3.1 --- Cargo.lock | 55 +++++++++++++++++++++++++ compute/compute-node.Dockerfile | 4 +- proxy/Cargo.toml | 1 + proxy/src/serverless/backend.rs | 16 ++++--- proxy/src/serverless/local_conn_pool.rs | 28 +++++++------ 5 files changed, 85 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f727741883..08453120c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1605,6 +1605,32 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "darling" version = "0.20.1" @@ -1875,6 +1901,28 @@ dependencies = [ "spki 0.7.3", ] +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "rand_core 0.6.4", + "sha2", + "subtle", +] + [[package]] name = "either" version = "1.8.1" @@ -2113,6 +2161,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.22" @@ -4745,6 +4799,7 @@ dependencies = [ "consumption_metrics", "dashmap 5.5.0", "ecdsa 0.16.9", + "ed25519-dalek", "env_logger 0.10.2", "fallible-iterator", "flate2", diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f346f402d4..2d38796d77 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1055,8 +1055,8 @@ ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ - echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo 
"5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f63ee3acc..f362a45035 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } +ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b398c3ddd0..6d5fb13681 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; +use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::ecdsa::SigningKey; -use p256::elliptic_curve::JwkEcKey; +use jose_jwk::jose_b64; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; @@ -354,9 +354,15 @@ impl PoolingBackend { } } -fn create_random_jwk() -> (SigningKey, JwkEcKey) { - let key = SigningKey::random(&mut OsRng); - let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); +fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { + let key = SigningKey::generate(&mut OsRng); + + let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + crv: jose_jwk::OkpCurves::Ed25519, + x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + d: None, + }); + (key, jwk) } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c51a2bc9ba..fe33f0ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -16,17 +16,16 @@ use std::sync::Arc; use std::task::{ready, Poll}; use std::time::Duration; +use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::future::poll_fn; use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; -use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; -use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_VERSION: &str = "0.2.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] @@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let cap = jwt.capacity(); // we only need an empty header with the alg specified. 
- // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" - jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" + jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); @@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use p256::ecdsa::SigningKey; + use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { - let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); @@ -381,12 +380,17 @@ mod tests { // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. // In the public-key box, paste the following jwk public key - // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` + // Note - jwt.io doesn't support EdDSA :( + // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 - // let pub_key = p256::ecdsa::VerifyingKey::from(&key); - // let pub_key = p256::PublicKey::from(pub_key); - // println!("{}", pub_key.to_jwk_string()); + // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + // crv: jose_jwk::OkpCurves::Ed25519, + // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + // d: None, + // }); + // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); } } From e9ed53b14f2a7feef862606626ae7d790a1346ce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:43:01 -0500 Subject: [PATCH 20/32] feat(pageserver): support inherited sparse keyspace (#10313) ## Problem In preparation to https://github.com/neondatabase/neon/issues/9516. We need to store rel size and directory data in the sparse keyspace, but it does not support inheritance yet. ## Summary of changes Add a new type of keyspace "sparse but inherited" into the system. On the read path: we don't remove the key range when we descend into the ancestor. The search will stop when (1) the full key range is covered by image layers (which has already been implemented before), or (2) we reach the end of the ancestor chain. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 39 ++++++- pageserver/src/tenant.rs | 135 ++++++++++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 25 ++++- 4 files changed, 193 insertions(+), 10 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index f0cd713c38..328dea5dec 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range { /// Non inherited range for vectored get. 
pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. -pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); +pub const SPARSE_RANGE: Range = Key::metadata_key_range(); impl Key { // AUX_FILES currently stores only data for logical replication (slots etc), and @@ -714,7 +714,42 @@ impl Key { // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(self) -> bool { - !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + if self.is_sparse() { + self.is_inherited_sparse_key() + } else { + !NON_INHERITED_RANGE.contains(&self) + } + } + + #[inline(always)] + pub fn is_sparse(self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Check if the key belongs to the inherited keyspace. + fn is_inherited_sparse_key(self) -> bool { + debug_assert!(self.is_sparse()); + self.field1 == RELATION_SIZE_PREFIX + } + + pub fn sparse_non_inherited_keyspace() -> Range { + // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace + debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } } #[inline(always)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2928c435cb..070593b104 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5682,7 +5682,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; @@ -7741,7 +7741,18 @@ mod tests { let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); + let base_inherited_key_child = + Key::from_hex("610000000033333333444444445500000001").unwrap(); + let base_inherited_key_nonexist = + Key::from_hex("610000000033333333444444445500000002").unwrap(); + let base_inherited_key_overwrite = + Key::from_hex("610000000033333333444444445500000003").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
+ assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( @@ -7750,7 +7761,18 @@ mod tests { DEFAULT_PG_VERSION, &ctx, Vec::new(), // delta layers - vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + vec![( + Lsn(0x20), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a"), + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ], + )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; @@ -7764,7 +7786,18 @@ mod tests { Vec::new(), // delta layers vec![( Lsn(0x30), - vec![(base_key_child, test_img("metadata key 2"))], + vec![ + ( + base_inherited_key_child, + test_img("metadata inherited key 2"), + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a"), + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ], )], // image layers Lsn(0x30), ) @@ -7786,6 +7819,26 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1a")) + ); // test vectored get on child timeline assert_eq!( @@ -7800,6 +7853,82 @@ mod tests { get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, + Some(test_img("metadata inherited key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2a")) + ); + + // test vectored scan on parent timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = tline + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a") + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ] + ); + + // test vectored scan on child timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = child + .get_vectored_impl( + 
KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_child, + test_img("metadata inherited key 2") + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a") + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ] + ); Ok(()) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index b8206fca5a..3913637ca0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -12,7 +12,7 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; -use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; @@ -209,7 +209,7 @@ impl ValuesReconstructState { .keys .entry(*key) .or_insert(Ok(VectoredValueReconstructState::default())); - let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); + let is_sparse_key = key.is_sparse(); if let Ok(state) = state { let key_done = match state.situation { ValueReconstructSituation::Complete => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c1b71262e0..f7227efeba 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,7 +27,7 @@ use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -3221,7 +3221,7 @@ impl Timeline { // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved @@ -3242,7 +3242,11 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + // Do not fire missing key error for sparse keys. + removed.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); if !removed.is_empty() { break Some(removed); } @@ -3257,6 +3261,21 @@ impl Timeline { timeline = &*timeline_owned; }; + // Remove sparse keys from the keyspace so that it doesn't fire errors. 
+ let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace { + let mut missing_keyspace = missing_keyspace; + missing_keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); + if missing_keyspace.is_empty() { + None + } else { + Some(missing_keyspace) + } + } else { + None + }; + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ From fd1368d31e2d159a518ac78dd6d5146a45a6de72 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 19:33:00 +0000 Subject: [PATCH 21/32] storcon: rework scheduler optimisation, prioritize AZ (#9916) ## Problem We want to do a more robust job of scheduling tenants into their home AZ: https://github.com/neondatabase/neon/issues/8264. Closes: https://github.com/neondatabase/neon/issues/8969 ## Summary of changes ### Scope This PR combines prioritizing AZ with a larger rework of how we do optimisation. The rationale is that just bumping AZ in the order of Score attributes is a very tiny change: the interesting part is lining up all the optimisation logic to respect this properly, which means rewriting it to use the same scores as the scheduler, rather than the fragile hand-crafted logic that we had before. Separating these changes out is possible, but would involve doing two rounds of test updates instead of one. ### Scheduling optimisation `TenantShard`'s `optimize_attachment` and `optimize_secondary` methods now both use the scheduler to pick a new "favourite" location. Then there is some refined logic for whether + how to migrate to it: - To decide if a new location is sufficiently "better", we generate scores using some projected ScheduleContexts that exclude the shard under consideration, so that we avoid migrating from a node with AffinityScore(2) to a node with AffinityScore(1), only to migrate back later. - Score types get a `for_optimization` method so that when we compare scores, we will only do an optimisation if the scores differ by their highest-ranking attributes, not just because one pageserver is lower in utilization. Eventually we _will_ want a mode that does this, but doing it here would make scheduling logic unstable and harder to test, and to do this correctly one needs to know the size of the tenant that one is migrating. - When we find a new attached location that we would like to move to, we will create a new secondary location there, even if we already had one on some other node. This handles the case where we have a home AZ A, and want to migrate the attachment between pageservers in that AZ while retaining a secondary location in some other AZ as well. - A unit test is added for https://github.com/neondatabase/neon/issues/8969, which is implicitly fixed by reworking optimisation to use the same scheduling scores as scheduling. 
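To make the score comparison concrete, here is a self-contained sketch of the `for_optimization` idea (struct and field names are simplified for illustration; the real types are the scheduling scores in `storage_controller/src/scheduler.rs` below). Ordering derives from field declaration order, so zeroing the utilization-related fields means two scores only compare as different when the higher-ranking attributes (AZ match, affinity) differ:

```
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug)]
struct Score {
    az_mismatch: bool, // false (preferred AZ) sorts before true
    affinity: usize,   // sibling shards already on this node (lower is better)
    utilization: u64,  // disk/shard-count based score; fluctuates over time
}

impl Score {
    // Drop the fluctuating component so optimisation only reacts to attributes
    // that genuinely require a migration to fix.
    fn for_optimization(self) -> Self {
        Self { utilization: 0, ..self }
    }
}

fn should_migrate(current: Score, candidate: Score) -> bool {
    // Move only if the candidate wins on AZ/affinity, not merely because the
    // current node happens to report a higher utilization right now.
    candidate.for_optimization() < current.for_optimization()
}

fn main() {
    let current = Score { az_mismatch: true, affinity: 2, utilization: 10 };
    let better_az = Score { az_mismatch: false, affinity: 2, utilization: 80 };
    let just_emptier = Score { az_mismatch: true, affinity: 2, utilization: 1 };
    assert!(should_migrate(current, better_az)); // fixes AZ placement: migrate
    assert!(!should_migrate(current, just_emptier)); // only utilization differs: stay
}
```

This mirrors why the optimiser avoids flapping: utilization still drives initial scheduling, but it is ignored when deciding whether an already-placed shard should move.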
--- libs/pageserver_api/src/controller_api.rs | 10 + storage_controller/Cargo.toml | 2 +- storage_controller/src/drain_utils.rs | 2 +- storage_controller/src/reconciler.rs | 16 +- storage_controller/src/scheduler.rs | 599 ++++++++--- storage_controller/src/service.rs | 244 ++++- .../src/service/context_iterator.rs | 9 +- storage_controller/src/tenant_shard.rs | 978 +++++++++++++----- .../performance/test_sharding_autosplit.py | 21 +- .../test_storage_controller_scale.py | 115 +- test_runner/regress/test_sharding.py | 32 +- .../regress/test_storage_controller.py | 40 +- 12 files changed, 1598 insertions(+), 470 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ba0eb0e4ae..f3aefc6df9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -367,6 +367,16 @@ pub enum PlacementPolicy { Detached, } +impl PlacementPolicy { + pub fn want_secondaries(&self) -> usize { + match self { + PlacementPolicy::Attached(secondary_count) => *secondary_count, + PlacementPolicy::Secondary => 1, + PlacementPolicy::Detached => 0, + } + } +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 5f3319512d..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } \ No newline at end of file diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 47f4276ff2..8b7be88078 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -112,7 +112,7 @@ impl TenantShardDrain { } } - match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + match tenant_shard.preferred_secondary(scheduler) { Some(node) => Some(node), None => { tracing::warn!( diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 6f584e7267..adced3b77d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -826,7 +826,21 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(&node, conf, None, false).await?; + // We only try to configure secondary locations if the node is available. This does + // not stop us succeeding with the reconcile, because our core goal is to make the + // shard _available_ (the attached location), and configuring secondary locations + // can be done lazily when the node becomes available (via background reconciliation). + if node.is_available() { + self.location_config(&node, conf, None, false).await?; + } else { + // If the node is unavailable, we skip and consider the reconciliation successful: this + // is a common case where a pageserver is marked unavailable: we demote a location on + // that unavailable pageserver to secondary. + tracing::info!("Skipping configuring secondary location {node}, it is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } // The condition below identifies a detach. 
We must have no attached intent and diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 51a4cf35be..04a594dcac 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node + /// is in their preferred AZ (i.e. this is their 'home' location) + home_shard_count: usize, /// Availability zone id in which the node resides az: AvailabilityZone, @@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { preferred_az: &Option, context: &ScheduleContext, ) -> Option; + + /// Return a score that drops any components based on node utilization: this is useful + /// for finding scores for scheduling optimisation, when we want to avoid rescheduling + /// shards due to e.g. disk usage, to avoid flapping. + fn for_optimization(&self) -> Self; + fn is_overloaded(&self) -> bool; fn node_id(&self) -> NodeId; } @@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeAttachmentSchedulingScore { - /// The number of shards belonging to the tenant currently being - /// scheduled that are attached to this node. - affinity_score: AffinityScore, /// Flag indicating whether this node matches the preferred AZ /// of the shard. For equal affinity scores, nodes in the matching AZ /// are considered first. az_match: AttachmentAzMatch, - /// Size of [`ScheduleContext::attached_nodes`] for the current node. - /// This normally tracks the number of attached shards belonging to the - /// tenant being scheduled that are already on this node. - attached_shards_in_context: usize, + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. 
When nodes have identical utilisation, this @@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), - attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, node_id: *node_id, }) } + /// For use in scheduling optimisation, where we only want to consider the aspects + /// of the score that can only be resolved by moving things (such as inter-shard affinity + /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which + /// can fluctuate for other reasons) + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_attached_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore { affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, - /// Total number of shards attached to this node. When nodes have identical utilisation, this - /// acts as an anti-affinity between attached shards. - total_attached_shard_count: usize, + /// Anti-affinity with other non-home locations: this gives the behavior that secondaries + /// will spread out across the nodes in an AZ. + total_non_home_shard_count: usize, /// Convenience to make selection deterministic in tests and empty systems node_id: NodeId, } @@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), utilization_score: utilization.cached_score(), - total_attached_shard_count: node.attached_shard_count, + total_non_home_shard_count: (node.shard_count - node.home_shard_count), node_id: *node_id, }) } + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_non_home_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -293,6 +319,10 @@ impl AffinityScore { pub(crate) fn inc(&mut self) { self.0 += 1; } + + pub(crate) fn dec(&mut self) { + self.0 -= 1; + } } impl std::ops::Add for AffinityScore { @@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, - /// Specifically how many _attached_ locations are on each node - pub(crate) attached_nodes: HashMap, - pub(crate) mode: ScheduleMode, } @@ -334,7 +361,6 @@ impl ScheduleContext { pub(crate) fn new(mode: ScheduleMode) -> Self { Self { nodes: HashMap::new(), - attached_nodes: HashMap::new(), mode, } } @@ -348,25 +374,31 @@ impl ScheduleContext { } } - pub(crate) fn push_attached(&mut self, node_id: NodeId) { - let entry = self.attached_nodes.entry(node_id).or_default(); - *entry += 1; - } - - pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { - self.nodes - .get(&node_id) - .copied() - .unwrap_or(AffinityScore::FREE) - } - - pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { - self.attached_nodes.get(&node_id).copied().unwrap_or(0) + /// Remove `shard`'s contributions to this context. This is useful when considering scheduling + /// this shard afresh, where we don't want it to e.g. 
experience anti-affinity to its current location. + pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self { + let mut new_context = self.clone(); + + if let Some(attached) = shard.intent.get_attached() { + if let Some(score) = new_context.nodes.get_mut(attached) { + score.dec(); + } + } + + for secondary in shard.intent.get_secondary() { + if let Some(score) = new_context.nodes.get_mut(secondary) { + score.dec(); + } + } + + new_context } + /// For test, track the sum of AffinityScore values, which is effectively how many + /// attached or secondary locations have been registered with this context. #[cfg(test)] - pub(crate) fn attach_count(&self) -> usize { - self.attached_nodes.values().sum() + pub(crate) fn location_count(&self) -> usize { + self.nodes.values().map(|i| i.0).sum() } } @@ -388,6 +420,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -415,6 +448,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -427,6 +461,9 @@ impl Scheduler { Some(node) => { node.shard_count += 1; node.attached_shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } } None => anyhow::bail!( "Tenant {} references nonexistent node {}", @@ -438,7 +475,12 @@ impl Scheduler { for node_id in shard.intent.get_secondary() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -482,13 +524,20 @@ impl Scheduler { /// /// It is an error to call this for a node that is not known to the scheduler (i.e. 
passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + pub(crate) fn update_node_ref_counts( + &mut self, + node_id: NodeId, + preferred_az: Option<&AvailabilityZone>, + update: RefCountUpdate, + ) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; + let is_home_az = Some(&node.az) == preferred_az; + match update { RefCountUpdate::PromoteSecondary => { node.attached_shard_count += 1; @@ -496,19 +545,31 @@ impl Scheduler { RefCountUpdate::Attach => { node.shard_count += 1; node.attached_shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::Detach => { node.shard_count -= 1; node.attached_shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } RefCountUpdate::DemoteAttached => { node.attached_shard_count -= 1; } RefCountUpdate::AddSecondary => { node.shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::RemoveSecondary => { node.shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } } @@ -594,6 +655,7 @@ impl Scheduler { entry.insert(SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }); @@ -607,33 +669,20 @@ impl Scheduler { } } - /// Where we have several nodes to choose from, for example when picking a secondary location - /// to promote to an attached location, this method may be used to pick the best choice based - /// on the scheduler's knowledge of utilization and availability. - /// - /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the - /// caller can pick a node some other way. - pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { - if nodes.is_empty() { - return None; - } - - // TODO: When the utilization score returned by the pageserver becomes meaningful, - // schedule based on that instead of the shard count. - let node = nodes - .iter() - .map(|node_id| { - let may_schedule = self - .nodes - .get(node_id) - .map(|n| !matches!(n.may_schedule, MaySchedule::No)) - .unwrap_or(false); - (*node_id, may_schedule) - }) - .max_by_key(|(_n, may_schedule)| *may_schedule); - - // If even the preferred node has may_schedule==false, return None - node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + /// Calculate a single node's score, used in optimizer logic to compare specific + /// nodes' scores. + pub(crate) fn compute_node_score( + &mut self, + node_id: NodeId, + preferred_az: &Option, + context: &ScheduleContext, + ) -> Option + where + Score: NodeSchedulingScore, + { + self.nodes + .get_mut(&node_id) + .and_then(|node| Score::generate(&node_id, node, preferred_az, context)) } /// Compute a schedulling score for each node that the scheduler knows of @@ -727,7 +776,7 @@ impl Scheduler { tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.node_id().0).collect::>() - ); + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -743,47 +792,74 @@ impl Scheduler { } /// For choosing which AZ to schedule a new shard into, use this. It will return the - /// AZ with the lowest median utilization. 
+    /// AZ with the lowest number of shards currently scheduled in this AZ as their home
+    /// location.
     ///
     /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
     /// node, because while tenants start out single sharded, when they grow and undergo
-    /// shard-split, they will occupy space on many nodes within an AZ.
+    /// shard-split, they will occupy space on many nodes within an AZ. It is important
+    /// that we pick the AZ in a way that balances this _future_ load.
     ///
-    /// We use median rather than total free space or mean utilization, because
-    /// we wish to avoid preferring AZs that have low-load nodes resulting from
-    /// recent replacements.
-    ///
-    /// The practical result is that we will pick an AZ based on its median node, and
-    /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
+    /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
+    /// nodes' utilization scores.
     pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
         if self.nodes.is_empty() {
             return None;
         }
 
-        let mut scores_by_az = HashMap::new();
-        for (node_id, node) in &self.nodes {
-            let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
-            let score = match &node.may_schedule {
-                MaySchedule::Yes(utilization) => utilization.score(),
-                MaySchedule::No => PageserverUtilization::full().score(),
-            };
-            az_scores.push((node_id, node, score));
+        #[derive(Default)]
+        struct AzScore {
+            home_shard_count: usize,
+            scheduleable: bool,
         }
 
-        // Sort by utilization. Also include the node ID to break ties.
-        for scores in scores_by_az.values_mut() {
-            scores.sort_by_key(|i| (i.2, i.0));
+        let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
+        for node in self.nodes.values() {
+            let az = azs.entry(&node.az).or_default();
+            az.home_shard_count += node.home_shard_count;
+            az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
         }
 
-        let mut median_by_az = scores_by_az
+        // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
+        // all nodes are overloaded or otherwise unschedulable).
+        if azs.values().any(|i| i.scheduleable) {
+            azs.retain(|_, i| i.scheduleable);
+        }
+
+        // Find the AZ with the lowest number of shards currently allocated
+        Some(
+            azs.into_iter()
+                .min_by_key(|i| (i.1.home_shard_count, i.0))
+                .unwrap()
+                .0
+                .clone(),
+        )
+    }
+
+    pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
+        self.nodes.get(node_id).map(|n| n.az.clone())
+    }
+
+    /// For use when choosing a preferred secondary location: filter out nodes that are not
+    /// available, and gather their AZs.
+    pub(crate) fn filter_usable_nodes(
+        &self,
+        nodes: &[NodeId],
+    ) -> Vec<(NodeId, Option<AvailabilityZone>)> {
+        nodes
             .iter()
-            .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
-            .collect::<Vec<_>>();
-        // Sort by utilization. Also include the AZ to break ties.
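For illustration, the new selection reduces to: sum `home_shard_count` per AZ, drop AZs with no schedulable node, then take the minimum with the AZ name as a tie-breaker. Below is a minimal standalone sketch of that behaviour; `MiniNode` and `az_for_new_tenant` are simplified stand-ins, not the controller's actual `SchedulerNode`/`Scheduler` types.

```rust
use std::collections::HashMap;

#[derive(Clone)]
struct MiniNode {
    az: String,
    home_shard_count: usize,
    schedulable: bool,
}

/// Pick the AZ with the fewest "home" shards, ignoring AZs where no node is schedulable.
/// Ties break on the AZ name so the result is deterministic.
fn az_for_new_tenant(nodes: &[MiniNode]) -> Option<String> {
    let mut azs: HashMap<&str, (usize, bool)> = HashMap::new();
    for node in nodes {
        let entry = azs.entry(node.az.as_str()).or_insert((0, false));
        entry.0 += node.home_shard_count;
        entry.1 |= node.schedulable;
    }

    // Only drop unschedulable AZs if at least one schedulable AZ remains.
    if azs.values().any(|(_, schedulable)| *schedulable) {
        azs.retain(|_, (_, schedulable)| *schedulable);
    }

    azs.into_iter()
        .min_by_key(|(az, (count, _))| (*count, az.to_string()))
        .map(|(az, _)| az.to_string())
}

fn main() {
    let nodes = vec![
        MiniNode { az: "az-a".into(), home_shard_count: 10, schedulable: true },
        MiniNode { az: "az-a".into(), home_shard_count: 0, schedulable: true },
        MiniNode { az: "az-b".into(), home_shard_count: 4, schedulable: true },
        MiniNode { az: "az-b".into(), home_shard_count: 4, schedulable: true },
    ];
    // az-a holds 10 home shards in total vs 8 in az-b, so az-b wins even though
    // az-a contains the single least-loaded node.
    assert_eq!(az_for_new_tenant(&nodes), Some("az-b".to_string()));
}
```

The example also shows why an AZ-wide sum is used rather than a per-node minimum: the least-loaded individual node may sit in the most heavily loaded AZ.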
- median_by_az.sort_by_key(|i| (i.1, i.0)); - - // Return the AZ with the lowest median utilization - Some(median_by_az.first().unwrap().0.clone()) + .filter_map(|node_id| { + let node = self + .nodes + .get(node_id) + .expect("Referenced nodes always exist"); + if matches!(node.may_schedule, MaySchedule::Yes(_)) { + Some((*node_id, Some(node.az.clone()))) + } else { + None + } + }) + .collect() } /// Unit test access to internal state @@ -843,7 +919,14 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::NodeAvailability, models::utilization::test_utilization, + shard::ShardIdentity, + }; + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; use super::*; @@ -853,8 +936,8 @@ mod tests { let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); - let mut t1_intent = IntentState::new(); - let mut t2_intent = IntentState::new(); + let mut t1_intent = IntentState::new(None); + let mut t2_intent = IntentState::new(None); let context = ScheduleContext::default(); @@ -930,7 +1013,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &None, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(None); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1063,7 +1146,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &preferred_az, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(preferred_az.clone()); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1089,9 +1172,9 @@ mod tests { &mut context, ); - // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id. assert_scheduler_chooses::( - NodeId(2), + NodeId(1), Some(az_a_tag.clone()), &mut scheduled_intents, &mut scheduler, @@ -1107,26 +1190,6 @@ mod tests { &mut context, ); - // Avoid nodes in "az-b" for the secondary location. - // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. - assert_scheduler_chooses::( - NodeId(1), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - - // Avoid nodes in "az-b" for the secondary location. - // Node 3 has lower affinity score than 1, so prefer that. - assert_scheduler_chooses::( - NodeId(3), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - for mut intent in scheduled_intents { intent.clear(&mut scheduler); } @@ -1150,34 +1213,292 @@ mod tests { let mut scheduler = Scheduler::new(nodes.values()); - /// Force the utilization of a node in Scheduler's state to a particular - /// number of bytes used. - fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) { - let mut node = Node::new( - node_id, - "".to_string(), - 0, - "".to_string(), - 0, - scheduler.nodes.get(&node_id).unwrap().az.clone(), - ); - node.set_availability(NodeAvailability::Active(test_utilization::simple( - shard_count, - 0, - ))); - scheduler.node_upsert(&node); + /// Force the `home_shard_count` of a node directly: this is the metric used + /// by the scheduler when picking AZs. 
+ fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) { + let node = scheduler.nodes.get_mut(&node_id).unwrap(); + node.home_shard_count = shard_count; } // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed - set_utilization(&mut scheduler, NodeId(1), 1000000); - assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - - // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler - // should prefer the other AZ. - set_utilization(&mut scheduler, NodeId(2), 1000000); + // Home shard count is higher in AZ A, so AZ B will be preferred + set_shard_count(&mut scheduler, NodeId(1), 10); assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + + // Total home shard count is higher in AZ B, so we revert to preferring AZ A + set_shard_count(&mut scheduler, NodeId(4), 6); + set_shard_count(&mut scheduler, NodeId(5), 6); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + } + + /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes + #[test] + fn az_selection_many() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let az_c_tag = AvailabilityZone("az-c".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + // We should get 1/6th of these on each node, give or take a few... + let total_tenants = 300; + + // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot + // on one AZ before correcting itself. This is because we select the 'home' AZ based on + // an AZ-wide metric, but we select the location for secondaries on a purely node-based + // metric (while excluding the home AZ). 
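The comment above states the test's expectation: with six nodes we want roughly one sixth of the tenants per node, within a small grace margin because AZ selection can briefly overshoot before correcting. A tiny sketch of that balance check, using hypothetical helper names rather than the test's actual code:

```rust
/// Check that every node's home-shard count is within `grace` of the ideal total/nodes split.
fn is_balanced(home_counts: &[i64], total: i64, grace: i64) -> bool {
    let nodes = home_counts.len() as i64;
    home_counts
        .iter()
        .all(|count| (count - total / nodes).abs() < grace)
}

fn main() {
    // 300 tenants over 6 nodes: 50 each is ideal, 48..52 is within a grace of 3.
    assert!(is_balanced(&[50, 49, 52, 48, 51, 50], 300, 3));
    assert!(!is_balanced(&[60, 40, 50, 50, 50, 50], 300, 3));
}
```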
+ let grace = 3; + + let mut scheduled_shards = Vec::new(); + for _i in 0..total_tenants { + let preferred_az = scheduler.get_az_for_new_tenant().unwrap(); + + let mut node_home_counts = scheduler + .nodes + .iter() + .map(|(node_id, node)| (node_id, node.home_shard_count)) + .collect::>(); + node_home_counts.sort_by_key(|i| i.0); + eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(preferred_az), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + scheduled_shards.push(shard); + } + + for (node_id, node) in &scheduler.nodes { + eprintln!( + "Node {}: {} {} {}", + node_id, node.shard_count, node.attached_shard_count, node.home_shard_count + ); + } + + for node in scheduler.nodes.values() { + assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } + + #[test] + /// Make sure that when we have an odd number of nodes and an even number of shards, we still + /// get scheduling stability. + fn odd_nodes_stability() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes( + 10, + &[ + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + preferred_az: Option, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + preferred_az, + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + 
shard_number: ShardNumber(2), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(4), + NodeId(9), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(4), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(10), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(5), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(6), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(7), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + // Assert that the optimizer suggests nochanges, i.e. our initial scheduling was stable. + for shard in &scheduled_shards { + assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8aa263f0c3..dadcc44cfb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1404,7 +1404,11 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. - let mut intent = IntentState::new(); + let mut intent = IntentState::new( + tsp.preferred_az_id + .as_ref() + .map(|az| AvailabilityZone(az.clone())), + ); if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) { if nodes.contains_key(&generation_pageserver) { @@ -2474,18 +2478,29 @@ impl Service { tenant_id: TenantId, _guard: &TracingExclusiveGuard, ) -> Result<(), ApiError> { - let present_in_memory = { + // Check if the tenant is present in memory, and select an AZ to use when loading + // if we will load it. + let load_in_az = { let locked = self.inner.read().unwrap(); - locked + let existing = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) - .next() - .is_some() - }; + .next(); - if present_in_memory { - return Ok(()); - } + // If the tenant is not present in memory, we expect to load it from database, + // so let's figure out what AZ to load it into while we have self.inner locked. + if existing.is_none() { + locked + .scheduler + .get_az_for_new_tenant() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "No AZ with nodes found to load tenant" + )))? + } else { + // We already have this tenant in memory + return Ok(()); + } + }; let tenant_shards = self.persistence.load_tenant(tenant_id).await?; if tenant_shards.is_empty() { @@ -2494,8 +2509,20 @@ impl Service { )); } - // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running - // compute, so no benefit to making AZ sticky across detaches. 
+ // Update the persistent shards with the AZ that we are about to apply to in-memory state + self.persistence + .set_tenant_shard_preferred_azs( + tenant_shards + .iter() + .map(|t| { + ( + t.get_tenant_shard_id().expect("Corrupt shard in database"), + load_in_az.clone(), + ) + }) + .collect(), + ) + .await?; let mut locked = self.inner.write().unwrap(); tracing::info!( @@ -2505,7 +2532,7 @@ impl Service { ); locked.tenants.extend(tenant_shards.into_iter().map(|p| { - let intent = IntentState::new(); + let intent = IntentState::new(Some(load_in_az.clone())); let shard = TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); @@ -4236,6 +4263,22 @@ impl Service { } tracing::info!("Restoring parent shard {tenant_shard_id}"); + + // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even + // if the original attachment location is offline. + if let Some(node_id) = shard.intent.get_attached() { + if !nodes.get(node_id).unwrap().is_available() { + tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.demote_attached(scheduler, *node_id); + } + } + for node_id in shard.intent.get_secondary().clone() { + if !nodes.get(&node_id).unwrap().is_available() { + tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.remove_secondary(scheduler, node_id); + } + } + shard.splitting = SplitState::Idle; if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or @@ -4389,15 +4432,13 @@ impl Service { let mut child_state = TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.intent = + IntentState::single(scheduler, Some(pageserver), preferred_az.clone()); child_state.observed = ObservedState { locations: child_observed, }; child_state.generation = Some(generation); child_state.config = config.clone(); - if let Some(preferred_az) = &preferred_az { - child_state.set_preferred_az(preferred_az.clone()); - } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -5014,6 +5055,8 @@ impl Service { // If our new attached node was a secondary, it no longer should be. 
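The split-abort path above demotes an attached intent on an unavailable node and drops secondaries on unavailable nodes before rescheduling, so the abort can make progress even when the original attachment location is offline. A simplified sketch of that filtering step, with a hypothetical `MiniIntent` rather than the controller's real `IntentState`/`Scheduler`:

```rust
use std::collections::HashSet;

type NodeId = u64;

#[derive(Debug, Default)]
struct MiniIntent {
    attached: Option<NodeId>,
    secondary: Vec<NodeId>,
}

/// Demote an attached location on an unavailable node and drop unavailable secondaries,
/// returning true if anything changed and the shard should be rescheduled.
fn drop_unavailable_locations(intent: &mut MiniIntent, available: &HashSet<NodeId>) -> bool {
    let mut changed = false;

    if let Some(node) = intent.attached {
        if !available.contains(&node) {
            // Demote: stop treating it as attached; it becomes a secondary for now.
            intent.attached = None;
            intent.secondary.push(node);
            changed = true;
        }
    }

    let before = intent.secondary.len();
    intent.secondary.retain(|node| available.contains(node));
    changed |= intent.secondary.len() != before;

    changed
}

fn main() {
    let available: HashSet<NodeId> = [2, 3].into_iter().collect();
    let mut intent = MiniIntent { attached: Some(1), secondary: vec![2, 4] };

    assert!(drop_unavailable_locations(&mut intent, &available));
    // Node 1 was demoted and then dropped because it is unavailable; node 4 was dropped too.
    assert_eq!(intent.attached, None);
    assert_eq!(intent.secondary, vec![2]);
    println!("{:?}", intent);
}
```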
shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { @@ -5025,8 +5068,6 @@ impl Service { shard.intent.push_secondary(scheduler, old_attached); } } - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Secondary => { shard.intent.clear(scheduler); @@ -5712,7 +5753,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.availability_zone_id, + register_req.availability_zone_id.clone(), ); // TODO: idempotency if the node already exists in the database @@ -5732,8 +5773,9 @@ impl Service { .set(locked.nodes.len() as i64); tracing::info!( - "Registered pageserver {}, now have {} pageservers", + "Registered pageserver {} ({}), now have {} pageservers", register_req.node_id, + register_req.availability_zone_id, locked.nodes.len() ); Ok(()) @@ -6467,6 +6509,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; + tracing::info!("Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { @@ -6497,7 +6540,13 @@ impl Service { let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // We are going to plan a bunch of optimisations before applying any of them, so the + // utilisation stats on nodes will be effectively stale for the >1st optimisation we + // generate. To avoid this causing unstable migrations/flapping, it's important that the + // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`] + // to ignore the utilisation component of the score. for (_tenant_id, schedule_context, shards) in TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) @@ -6528,13 +6577,28 @@ impl Service { continue; } - // TODO: optimization calculations are relatively expensive: create some fast-path for - // the common idle case (avoiding the search on tenants that we have recently checked) + // Fast path: we may quickly identify shards that don't have any possible optimisations + if !shard.maybe_optimizable(scheduler, &schedule_context) { + if cfg!(feature = "testing") { + // Check that maybe_optimizable doesn't disagree with the actual optimization functions. + // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't + // panic in prod if we hit this, or spend cycles on it in prod. + assert!(shard + .optimize_attachment(scheduler, &schedule_context) + .is_none()); + assert!(shard + .optimize_secondary(scheduler, &schedule_context) + .is_none()); + } + continue; + } + if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to // its primary location based on soft constraints, cut it over. 
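The migrate handler above now removes the target node from the secondary list and sets it attached *before* demoting the previously attached node to a secondary. That ordering matters because the intent must never hold the same node as both attached and secondary, an invariant this patch also asserts in `IntentState`. A minimal sketch of the ordering (simplified types, ignoring the secondary-count policy check in the real handler):

```rust
type NodeId = u64;

#[derive(Debug, Default)]
struct MiniIntent {
    attached: Option<NodeId>,
    secondary: Vec<NodeId>,
}

impl MiniIntent {
    fn set_attached(&mut self, node: NodeId) {
        // Invariant: a node must not be attached and secondary at the same time.
        assert!(!self.secondary.contains(&node));
        self.attached = Some(node);
    }

    fn push_secondary(&mut self, node: NodeId) {
        assert!(self.attached != Some(node));
        assert!(!self.secondary.contains(&node));
        self.secondary.push(node);
    }

    fn remove_secondary(&mut self, node: NodeId) {
        self.secondary.retain(|n| *n != node);
    }
}

/// Migrate the attachment to `target`, keeping the old attached location as a secondary.
/// Order matters: drop `target` from the secondaries before attaching to it, and only
/// demote the old attachment after the new one is in place.
fn migrate_attachment(intent: &mut MiniIntent, target: NodeId) {
    let old_attached = intent.attached;

    intent.remove_secondary(target);
    intent.set_attached(target);

    if let Some(old) = old_attached {
        if old != target {
            intent.push_secondary(old);
        }
    }
}

fn main() {
    let mut intent = MiniIntent { attached: Some(1), secondary: vec![2] };
    migrate_attachment(&mut intent, 2);
    assert_eq!(intent.attached, Some(2));
    assert_eq!(intent.secondary, vec![1]);
}
```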
- shard.optimize_attachment(nodes, &schedule_context) + shard.optimize_attachment(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } else if let Some(optimization) = @@ -6544,6 +6608,7 @@ impl Service { // in the same tenant with secondary locations on the node where they originally split. shard.optimize_secondary(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } @@ -6592,8 +6657,10 @@ impl Service { } } } - ScheduleOptimizationAction::ReplaceSecondary(_) => { - // No extra checks needed to replace a secondary: this does not interrupt client access + ScheduleOptimizationAction::ReplaceSecondary(_) + | ScheduleOptimizationAction::CreateSecondary(_) + | ScheduleOptimizationAction::RemoveSecondary(_) => { + // No extra checks needed to manage secondaries: this does not interrupt client access validated_work.push((tenant_shard_id, optimization)) } }; @@ -6665,26 +6732,35 @@ impl Service { /// we have this helper to move things along faster. #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { - let (attached_node, secondary_node) = { + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: not found" + ); return; }; - let (Some(attached), Some(secondary)) = ( - shard.intent.get_attached(), - shard.intent.get_secondary().first(), - ) else { + + let Some(attached) = shard.intent.get_attached() else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: no attached" + ); return; }; - ( - locked.nodes.get(attached).unwrap().clone(), - locked.nodes.get(secondary).unwrap().clone(), - ) + + let secondaries = shard + .intent + .get_secondary() + .iter() + .map(|n| locked.nodes.get(n).unwrap().clone()) + .collect::>(); + + (locked.nodes.get(attached).unwrap().clone(), secondaries) }; // Make remote API calls to upload + download heatmaps: we ignore errors because this is just // a 'kick' to let scheduling optimisation run more promptly. 
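The optimization planner above gains a fast path: `maybe_optimizable` is a cheap pre-check that may return false positives but must never return a false negative, and testing builds assert that it agrees with the expensive `optimize_*` functions. A generic sketch of that pattern with made-up predicates (not the controller's real scoring inputs):

```rust
/// Cheap pre-filter used before running an expensive optimization pass: it may say "maybe"
/// when nothing is actually possible, but it must never say "no" when something is.
fn maybe_optimizable(affinity: u32, in_preferred_az: bool, extra_secondaries: usize) -> bool {
    affinity > 0 || !in_preferred_az || extra_secondaries > 0
}

/// Stand-in for the expensive pass (optimize_attachment / optimize_secondary in the patch).
fn find_optimization(
    affinity: u32,
    in_preferred_az: bool,
    extra_secondaries: usize,
) -> Option<&'static str> {
    if !in_preferred_az {
        Some("migrate into preferred AZ")
    } else if extra_secondaries > 0 {
        Some("drop excess secondary")
    } else if affinity > 0 {
        Some("spread shards of the same tenant")
    } else {
        None
    }
}

fn main() {
    for (affinity, in_az, extra) in [(0, true, 0), (2, true, 0), (0, false, 1)] {
        if !maybe_optimizable(affinity, in_az, extra) {
            // Fast path: skip the expensive search entirely. In testing builds the
            // controller asserts the two functions agree, as this sketch does here.
            debug_assert!(find_optimization(affinity, in_az, extra).is_none());
            continue;
        }
        println!("{:?} -> {:?}", (affinity, in_az, extra), find_optimization(affinity, in_az, extra));
    }
}
```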
- attached_node + match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.jwt_token, @@ -6693,22 +6769,57 @@ impl Service { SHORT_RECONCILE_TIMEOUT, &self.cancel, ) - .await; + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}" + ); + } + Some(Ok(_)) => { + tracing::info!( + "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}" + ); + } + } - secondary_node - .with_client_retries( - |client| async move { - client - .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) - .await - }, - &self.config.jwt_token, - 3, - 10, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; + for secondary_node in secondaries { + match secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(Duration::from_secs(1)), + ) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + } + Some(Ok(progress)) => { + tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + } + } + } } /// Look for shards which are oversized and in need of splitting @@ -7144,9 +7255,15 @@ impl Service { fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let mut tids_by_node = locked - .tenants + let node_az = nodes + .get(&node_id) + .expect("Node must exist") + .get_availability_zone_id() + .clone(); + + let mut tids_by_node = tenants .iter_mut() .filter_map(|(tid, tenant_shard)| { if !matches!( @@ -7159,6 +7276,25 @@ impl Service { return None; } + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved. + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it is + // elegible to be moved: fall through; + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling: + // do not include it in the fill plan. 
+ return None; + } + } + if tenant_shard.intent.get_secondary().contains(&node_id) { if let Some(primary) = tenant_shard.intent.get_attached() { return Some((*primary, *tid)); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index d38010a27e..dd6913e988 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { // Accumulate the schedule context for all the shards in a tenant schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } tenant_shards.push(shard); if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { @@ -115,7 +112,7 @@ mod tests { assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t2_id); @@ -124,13 +121,13 @@ mod tests { assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); assert_eq!(shards.len(), 4); - assert_eq!(context.attach_count(), 4); + assert_eq!(context.location_count(), 8); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t3_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); for shard in tenants.values_mut() { shard.intent.clear(&mut scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e0a71b5822..2ba2a57eba 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -11,16 +11,14 @@ use crate::{ persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ - AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, - SecondaryShardTag, + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, }, service::ReconcileResultRequest, }; use futures::future::{self, Either}; use itertools::Itertools; -use pageserver_api::controller_api::{ - AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, -}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -33,6 +31,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + shard::ShardCount, sync::gate::GateGuard, }; @@ -147,45 +146,67 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. 
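The fill-plan change above only promotes shards back onto the node being filled when their home AZ matches that node's AZ (or when they have no AZ preference). A standalone sketch of that filter; `MiniShard` and `fill_candidates` are illustrative names, not the service's actual types:

```rust
type NodeId = u64;

#[derive(Clone)]
struct MiniShard {
    id: u32,
    preferred_az: Option<String>,
    attached: NodeId,
    secondary: Vec<NodeId>,
}

/// Select shards that may be promoted onto `fill_node`: they must already have a secondary
/// there, and their home AZ (if any) must match the AZ of the node being filled.
fn fill_candidates(shards: &[MiniShard], fill_node: NodeId, fill_node_az: &str) -> Vec<u32> {
    shards
        .iter()
        .filter(|shard| match shard.preferred_az.as_deref() {
            // No AZ preference: eligible to be moved.
            None => true,
            // Home AZ matches the node we're filling: eligible to move back.
            Some(az) if az == fill_node_az => true,
            // Home is elsewhere: leave it where it is.
            Some(_) => false,
        })
        .filter(|shard| shard.secondary.contains(&fill_node) && shard.attached != fill_node)
        .map(|shard| shard.id)
        .collect()
}

fn main() {
    let shards = vec![
        MiniShard { id: 1, preferred_az: Some("az-a".into()), attached: 2, secondary: vec![1] },
        MiniShard { id: 2, preferred_az: Some("az-b".into()), attached: 2, secondary: vec![1] },
        MiniShard { id: 3, preferred_az: None, attached: 3, secondary: vec![1] },
    ];
    // Filling node 1 (in az-a): shard 2 stays put because its home AZ is az-b.
    assert_eq!(fill_candidates(&shards, 1, "az-a"), vec![1, 3]);
}
```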
preferred_az_id: Option, } -#[derive(Default, Clone, Debug, Serialize)] -pub(crate) struct IntentState { - attached: Option, - secondary: Vec, -} - impl IntentState { - pub(crate) fn new() -> Self { + pub(crate) fn new(preferred_az_id: Option) -> Self { Self { attached: None, secondary: vec![], + preferred_az_id, } } - pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + pub(crate) fn single( + scheduler: &mut Scheduler, + node_id: Option, + preferred_az_id: Option, + ) -> Self { if let Some(node_id) = node_id { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + node_id, + preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } Self { attached: node_id, secondary: vec![], + preferred_az_id, } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } if let Some(new_attached) = &new_attached { - scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + *new_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } self.attached = new_attached; } + + if let Some(new_attached) = &new_attached { + assert!(!self.secondary.contains(new_attached)); + } } /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from @@ -204,15 +225,28 @@ impl IntentState { let demoted = self.attached; self.attached = Some(promote_secondary); - scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + scheduler.update_node_ref_counts( + promote_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::PromoteSecondary, + ); if let Some(demoted) = demoted { - scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + demoted, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { - debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + assert!(!self.secondary.contains(&new_secondary)); + assert!(self.attached != Some(new_secondary)); + scheduler.update_node_ref_counts( + new_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::AddSecondary, + ); self.secondary.push(new_secondary); } @@ -220,27 +254,43 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) 
{ - scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } self.clear_secondary(scheduler); @@ -275,7 +325,11 @@ impl IntentState { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); - scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); true } else { false @@ -315,6 +369,7 @@ pub(crate) struct ObservedStateLocation { /// we know that we might have some state on this node. pub(crate) conf: Option, } + pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. @@ -360,6 +415,10 @@ pub(crate) enum ScheduleOptimizationAction { ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), + // Create a secondary location, with the intent of later migrating to it + CreateSecondary(NodeId), + // Remove a secondary location that we previously created to facilitate a migration + RemoveSecondary(NodeId), } #[derive(Eq, PartialEq, Debug, Clone)] @@ -486,7 +545,7 @@ impl TenantShard { Self { tenant_shard_id, policy, - intent: IntentState::default(), + intent: IntentState::new(preferred_az_id), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), @@ -500,7 +559,6 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id, } } @@ -563,7 +621,7 @@ impl TenantShard { return Ok((false, node_id)); } - if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + if let Some(promote_secondary) = self.preferred_secondary(scheduler) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); @@ -572,7 +630,7 @@ impl TenantShard { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); @@ -594,9 +652,6 @@ impl TenantShard { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); - if let Some(attached) = self.intent.get_attached() { - context.push_attached(*attached); - } r } @@ -631,24 +686,7 @@ impl TenantShard { use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { - let retain_secondaries = if self.intent.attached.is_none() - && 
scheduler.node_preferred(&self.intent.secondary).is_some() - { - // If we have no attached, and one of the secondaries is elegible to be promoted, retain - // one more secondary than we usually would, as one of them will become attached futher down this function. - secondary_count + 1 - } else { - secondary_count - }; - - while self.intent.secondary.len() > retain_secondaries { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); - modified = true; - } - - // Should have exactly one attached, and N secondaries + // Should have exactly one attached, and at least N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; @@ -657,7 +695,7 @@ impl TenantShard { while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -674,7 +712,7 @@ impl TenantShard { // Populate secondary by scheduling a fresh node let node_id = scheduler.schedule_shard::( &[], - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -718,7 +756,7 @@ impl TenantShard { ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, - None => match scheduler.node_preferred(self.intent.get_secondary()) { + None => match self.preferred_secondary(scheduler) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); @@ -745,90 +783,276 @@ impl TenantShard { Ok(()) } + /// Returns None if the current location's score is unavailable, i.e. cannot draw a conclusion + fn is_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + candidate: NodeId, + ) -> Option { + let Some(candidate_score) = scheduler.compute_node_score::( + candidate, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // The candidate node is unavailable for scheduling or otherwise couldn't get a score + return None; + }; + + match scheduler.compute_node_score::( + current, + &self.intent.preferred_az_id, + schedule_context, + ) { + Some(current_score) => { + // Ignore utilization components when comparing scores: we don't want to migrate + // because of transient load variations, it risks making the system thrash, and + // migrating for utilization requires a separate high level view of the system to + // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily + // moving things around in the order that we hit this function. + let candidate_score = candidate_score.for_optimization(); + let current_score = current_score.for_optimization(); + + if candidate_score < current_score { + tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + Some(true) + } else { + // The candidate node is no better than our current location, so don't migrate + tracing::debug!( + "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})", + ); + Some(false) + } + } + None => { + // The current node is unavailable for scheduling, so we can't make any sensible + // decisions about optimisation. 
This should be a transient state -- if the node + // is offline then it will get evacuated, if is blocked by a scheduling mode + // then we will respect that mode by doing nothing. + tracing::debug!("Current node {current} is unavailable for scheduling"); + None + } + } + } + + fn find_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + hard_exclude: &[NodeId], + ) -> Option { + // Look for a lower-scoring location to attach to + let Ok(candidate_node) = scheduler.schedule_shard::( + hard_exclude, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // A scheduling error means we have no possible candidate replacements + tracing::debug!("No candidate node found"); + return None; + }; + + if candidate_node == current { + // We're already at the best possible location, so don't migrate + tracing::debug!("Candidate node {candidate_node} is already in use"); + return None; + } + + self.is_better_location::(scheduler, schedule_context, current, candidate_node) + .and_then(|better| if better { Some(candidate_node) } else { None }) + } + + /// This function is an optimization, used to avoid doing large numbers of scheduling operations + /// when looking for optimizations. This function uses knowledge of how scores work to do some + /// fast checks for whether it may to be possible to improve a score. + /// + /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is. If we + /// return no, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no + /// work. + pub(crate) fn maybe_optimizable( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + ) -> bool { + // Sharded tenant: check if any locations have a nonzero affinity score + if self.shard.count >= ShardCount(1) { + let schedule_context = schedule_context.project_detach(self); + for node in self.intent.all_pageservers() { + if let Some(af) = schedule_context.nodes.get(&node) { + if *af > AffinityScore(0) { + return true; + } + } + } + } + + // Attached tenant: check if the attachment is outside the preferred AZ + if let PlacementPolicy::Attached(_) = self.policy { + if let Some(attached) = self.intent.get_attached() { + if scheduler.get_node_az(attached) != self.intent.preferred_az_id { + return true; + } + } + } + + // Tenant with secondary locations: check if any are within the preferred AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return true; + } + } + + // Does the tenant have excess secondaries? + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + return true; + } + + // Fall through: no optimizations possible + false + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, - nodes: &HashMap, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. 
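As the comment in `is_better_location` above explains, scores are compared only after projecting out their utilization component, so transient load differences cannot cause flapping migrations. A minimal sketch of that projection idea with a toy score type (not the real `NodeAttachmentSchedulingScore`):

```rust
/// A toy scheduling score: lower is better, compared lexicographically.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct MiniScore {
    outside_preferred_az: bool, // highest priority: stay in the preferred AZ
    affinity: u32,              // locations of the same tenant already on the node
    utilization: u64,           // transient load; ignored when comparing for optimization
}

impl MiniScore {
    /// Project out the utilization component so the optimizer only reacts to stable
    /// properties (AZ and affinity), not to load fluctuations.
    fn for_optimization(self) -> (bool, u32) {
        (self.outside_preferred_az, self.affinity)
    }
}

fn is_better_location(current: MiniScore, candidate: MiniScore) -> bool {
    candidate.for_optimization() < current.for_optimization()
}

fn main() {
    let current = MiniScore { outside_preferred_az: false, affinity: 1, utilization: 10 };

    // Lower utilization alone is not a reason to migrate...
    let less_loaded = MiniScore { outside_preferred_az: false, affinity: 1, utilization: 2 };
    assert!(!is_better_location(current, less_loaded));

    // ...but lower affinity is, even if the candidate is more loaded right now.
    let less_affinity = MiniScore { outside_preferred_az: false, affinity: 0, utilization: 50 };
    assert!(is_better_location(current, less_affinity));
}
```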
- return None; - } - let current_affinity_score = schedule_context.get_node_affinity(attached); - let current_attachment_count = schedule_context.get_node_attachments(attached); + let schedule_context = schedule_context.project_detach(self); - // Generate score for each node, dropping any un-schedulable nodes. - let all_pageservers = self.intent.all_pageservers(); - let mut scores = all_pageservers - .iter() - .flat_map(|node_id| { - let node = nodes.get(node_id); - if node.is_none() { - None - } else if matches!( - node.unwrap().get_scheduling(), - NodeSchedulingPolicy::Filling - ) { - // If the node is currently filling, don't count it as a candidate to avoid, - // racing with the background fill. - None - } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { - None - } else { - let affinity_score = schedule_context.get_node_affinity(*node_id); - let attachment_count = schedule_context.get_node_attachments(*node_id); - Some((*node_id, affinity_score, attachment_count)) - } - }) - .collect::>(); - - // Sort precedence: - // 1st - prefer nodes with the lowest total affinity score - // 2nd - prefer nodes with the lowest number of attachments in this context - // 3rd - if all else is equal, sort by node ID for determinism in tests. - scores.sort_by_key(|i| (i.1, i.2, i.0)); - - if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = - scores.first() - { - if attached != *preferred_node { - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called (e.g. there's no point migrating from - // a location with score 1 to a score zero, because on next location the situation - // would be the same, but in reverse). - if current_affinity_score > *preferred_affinity_score + AffinityScore(1) - || current_attachment_count > *preferred_attachment_count + 1 - { - tracing::info!( - "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", - self.intent.get_secondary() - ); - return Some(ScheduleOptimization { - sequence: self.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - }), - }); - } - } else { - tracing::debug!( - "Node {} is already preferred (score {:?})", - preferred_node, - preferred_affinity_score - ); + // If we already have a secondary that is higher-scoring than out current location, + // then simply migrate to it. + for secondary in self.intent.get_secondary() { + if let Some(true) = self.is_better_location::( + scheduler, + &schedule_context, + attached, + *secondary, + ) { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *secondary, + }), + }); } } - // Fall-through: we didn't find an optimization - None + // Given that none of our current secondaries is a better location than our current + // attached location (checked above), we may trim any secondaries that are not needed + // for the placement policy. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // This code path cleans up extra secondaries after migrating, and/or + // trims extra secondaries after a PlacementPolicy::Attached(N) was + // modified to decrease N. 
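`optimize_attachment` above starts by calling `project_detach`, which subtracts the shard's own attached and secondary locations from the context's per-node affinity, so a shard does not penalize nodes merely for hosting itself. A standalone sketch of that idea, using a simplified context rather than the real `ScheduleContext`:

```rust
use std::collections::HashMap;

type NodeId = u64;

#[derive(Clone, Default)]
struct MiniScheduleContext {
    // Per-node affinity: how many locations of the same tenant are already on the node.
    nodes: HashMap<NodeId, u32>,
}

impl MiniScheduleContext {
    fn avoid(&mut self, locations: &[NodeId]) {
        for node in locations {
            *self.nodes.entry(*node).or_insert(0) += 1;
        }
    }

    /// Return a copy of the context with one shard's own locations subtracted,
    /// so that shard can score candidate nodes without counting itself.
    fn project_detach(&self, own_locations: &[NodeId]) -> Self {
        let mut projected = self.clone();
        for node in own_locations {
            if let Some(score) = projected.nodes.get_mut(node) {
                *score = score.saturating_sub(1);
            }
        }
        projected
    }
}

fn main() {
    let mut ctx = MiniScheduleContext::default();
    // Two shards of the same tenant: one on nodes {1, 2}, one on {1, 3}.
    ctx.avoid(&[1, 2]);
    ctx.avoid(&[1, 3]);

    // From the first shard's point of view, node 1 still carries affinity 1 (the *other* shard),
    // while its own locations no longer count against nodes 1 and 2.
    let projected = ctx.project_detach(&[1, 2]);
    assert_eq!(projected.nodes[&1], 1);
    assert_eq!(projected.nodes[&2], 0);
    assert_eq!(projected.nodes[&3], 1);
}
```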
+ + let secondary_scores = self + .intent + .get_secondary() + .iter() + .map(|node_id| { + ( + *node_id, + scheduler.compute_node_score::( + *node_id, + &self.intent.preferred_az_id, + &schedule_context, + ), + ) + }) + .collect::>(); + + if secondary_scores.iter().any(|score| score.1.is_none()) { + // Don't have full list of scores, so can't make a good decision about which to drop unless + // there is an obvious one in the wrong AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + }); + } + } + + // Fall through: we didn't identify one to remove. This ought to be rare. + tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + self.intent.get_secondary() + ); + } else { + let victim = secondary_scores + .iter() + .max_by_key(|score| score.1.unwrap()) + .unwrap() + .0; + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(victim), + }); + } + } + + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + attached, + &[], // Don't exclude secondaries: our preferred attachment location may be a secondary + ); + + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. + if let Some(replacement) = replacement { + // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still + // not in our preferred AZ. Migration has a cost in resources an impact to the workload, so we want to avoid doing + // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred + // AZ: skip this optimization if it is not in our final, preferred AZ. + // + // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes + // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). + if self.intent.preferred_az_id.is_some() + && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id + { + tracing::debug!( + "Candidate node {replacement} is not in preferred AZ {:?}", + self.intent.preferred_az_id + ); + + // This should only happen if our current location is not in the preferred AZ, otherwise + // [`Self::find_better_location`]` should have rejected any other location outside the preferred Az, because + // AZ is the highest priority part of NodeAttachmentSchedulingScore. + debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id); + + return None; + } + + if !self.intent.get_secondary().contains(&replacement) { + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::CreateSecondary(replacement), + }) + } else { + // We already have a secondary in the preferred location, let's try migrating to it. Our caller + // will check the warmth of the destination before deciding whether to really execute this. 
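The secondary-trimming branch above picks the worst-scoring secondary as the victim, unless some scores are unavailable, in which case it only acts on the obvious case of a secondary sitting in the preferred AZ. A compact sketch of the victim-selection rule with placeholder score values (not the controller's real score type):

```rust
/// Given (node_id, score) pairs for the current secondaries, pick one to remove when we
/// hold more secondaries than the placement policy asks for. Higher score == worse location.
fn pick_secondary_to_remove(secondaries: &[(u64, Option<u32>)]) -> Option<u64> {
    if secondaries.iter().any(|(_, score)| score.is_none()) {
        // Incomplete information (e.g. a node is offline): don't guess which one is worst.
        return None;
    }

    secondaries
        .iter()
        .copied()
        .max_by_key(|(node_id, score)| (score.unwrap(), *node_id))
        .map(|(node_id, _)| node_id)
}

fn main() {
    // Node 3 has the worst (highest) score, so it is the one we would drop.
    assert_eq!(pick_secondary_to_remove(&[(1, Some(5)), (3, Some(9)), (7, Some(2))]), Some(3));
    // With a missing score we refuse to pick a victim.
    assert_eq!(pick_secondary_to_remove(&[(1, Some(5)), (3, None)]), None);
}
```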
+ Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: replacement, + }), + }) + } + } else { + // We didn't find somewhere we'd rather be, and we don't have any excess secondaries + // to clean up: no action required. + None + } } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] @@ -837,50 +1061,40 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // We have extra secondaries, perhaps to facilitate a migration of the attached location: + // do nothing, it is up to [`Self::optimize_attachment`] to clean them up. When that's done, + // and we are called again, we will proceed. + tracing::debug!("Too many secondaries: skipping"); return None; } + let schedule_context = schedule_context.project_detach(self); + for secondary in self.intent.get_secondary() { - let Some(affinity_score) = schedule_context.nodes.get(secondary) else { - // We're already on a node unaffected any affinity constraints, - // so we won't change it. - continue; + // Make sure we don't try to migrate a secondary to our attached location: this case happens + // easily in environments without multiple AZs. + let exclude = match self.intent.attached { + Some(attached) => vec![attached], + None => vec![], }; - // Let the scheduler suggest a node, where it would put us if we were scheduling afresh - // This implicitly limits the choice to nodes that are available, and prefers nodes - // with lower utilization. - let Ok(candidate_node) = scheduler.schedule_shard::( - &self.intent.all_pageservers(), - &self.preferred_az_id, - schedule_context, - ) else { - // A scheduling error means we have no possible candidate replacements - continue; - }; - - let candidate_affinity_score = schedule_context - .nodes - .get(&candidate_node) - .unwrap_or(&AffinityScore::FREE); - - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called. - if *candidate_affinity_score + AffinityScore(1) < *affinity_score { - // If some other node is available and has a lower score than this node, then - // that other node is a good place to migrate to. - tracing::info!( - "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", - self.intent.get_secondary() - ); + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ); + assert!(replacement != Some(*secondary)); + if let Some(replacement) = replacement { + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. 
                return Some(ScheduleOptimization {
                    sequence: self.sequence,
                    action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
                        old_node_id: *secondary,
-                        new_node_id: candidate_node,
+                        new_node_id: replacement,
                    }),
                });
            }
@@ -921,11 +1135,54 @@ impl TenantShard {
                 self.intent.remove_secondary(scheduler, old_node_id);
                 self.intent.push_secondary(scheduler, new_node_id);
             }
+            ScheduleOptimizationAction::CreateSecondary(new_node_id) => {
+                self.intent.push_secondary(scheduler, new_node_id);
+            }
+            ScheduleOptimizationAction::RemoveSecondary(old_secondary) => {
+                self.intent.remove_secondary(scheduler, old_secondary);
+            }
         }
 
         true
     }
 
+    /// When a shard has several secondary locations, we need to pick one in situations where
+    /// we promote one of them to an attached location:
+    /// - When draining a node for restart
+    /// - When responding to a node failure
+    ///
+    /// In this context, 'preferred' does not mean the node with the best scheduling score: instead
+    /// we want to pick the node which is best for use _temporarily_ while the previous attached location
+    /// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary
+    /// locations in a non-preferred AZ, as they're more likely to have a warm cache than a temporary
+    /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist
+    /// they're probably not warmed up yet).
+    ///
+    /// If the input is empty, or all the nodes are not eligible for scheduling, return None: the
+    /// caller needs to pick a node some other way.
+    pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option<NodeId> {
+        let candidates = scheduler.filter_usable_nodes(&self.intent.secondary);
+
+        // We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e. we prefer
+        // to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ),
+        // rather than a short-lived secondary location being used for optimization/migration (which would have
+        // been scheduled in our preferred AZ).
+        let mut candidates = candidates
+            .iter()
+            .map(|(node_id, node_az)| {
+                if node_az == &self.intent.preferred_az_id {
+                    (1, *node_id)
+                } else {
+                    (0, *node_id)
+                }
+            })
+            .collect::<Vec<_>>();
+
+        candidates.sort();
+
+        candidates.first().map(|i| i.1)
+    }
+
     /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
     /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
     /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
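The sort key in `preferred_secondary` above boils down to ranking candidates by (is-in-preferred-AZ, node id) and taking the first, so a long-lived secondary outside the preferred AZ wins over a freshly created migration secondary inside it. A self-contained sketch of that ordering with hypothetical inputs:

```rust
/// Pick which secondary to promote when the attached location is lost: prefer nodes that are
/// *not* in the shard's preferred AZ, because those are the long-lived secondaries that are
/// most likely to have a warm cache; break ties on node id for determinism.
fn preferred_secondary(
    candidates: &[(u64, Option<&str>)],
    preferred_az: Option<&str>,
) -> Option<u64> {
    candidates
        .iter()
        .map(|(node_id, node_az)| {
            let in_preferred_az = node_az.is_some() && *node_az == preferred_az;
            (u8::from(in_preferred_az), *node_id)
        })
        .min()
        .map(|(_, node_id)| node_id)
}

fn main() {
    // Node 5 sits in the preferred AZ (a migration helper), node 8 does not (a long-lived
    // secondary): we promote node 8 even though its id is higher.
    let candidates: [(u64, Option<&str>); 2] = [(5, Some("az-a")), (8, Some("az-b"))];
    assert_eq!(preferred_secondary(&candidates, Some("az-a")), Some(8));
}
```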
@@ -1207,7 +1464,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), - preferred_az: self.preferred_az_id.clone(), + preferred_az: self.intent.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1428,7 +1685,6 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1444,16 +1700,16 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), + preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.preferred_az_id.as_ref() + self.intent.preferred_az_id.as_ref() } pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.preferred_az_id = Some(preferred_az_id); + self.intent.preferred_az_id = Some(preferred_az_id); } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1756,65 +2012,90 @@ pub(crate) mod tests { } #[test] - fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3, &[]); + /// Simple case: moving attachment to somewhere better where we already have a secondary + fn optimize_attachment_simple() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); // Initially: both nodes attached on shard 1, and both have secondary locations // on different nodes. 
- shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(1)); shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); - let mut schedule_context = ScheduleContext::default(); - schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); - schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } - let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); - - // Either shard should recognize that it has the option to switch to a secondary location where there - // would be no other shards from the same tenant, and request to do so. + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context); assert_eq!( optimization_a, Some(ScheduleOptimization { sequence: shard_a.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) }) }) ); - - // Note that these optimizing two shards in the same tenant with the same ScheduleContext is - // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility - // of [`Service::optimize_all`] to avoid trying - // to do optimizations for multiple shards in the same tenant at the same time. Generating - // both optimizations is just done for test purposes - let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); - assert_eq!( - optimization_b, - Some(ScheduleOptimization { - sequence: shard_b.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - }) - }) - ); - - // Applying these optimizations should result in the end state proposed shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); - assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); - assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); - shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); - assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); - assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + // // Either shard should recognize that it has the option to switch to a secondary location where there + // // would be no other shards from the same tenant, and request to do so. 
+ // assert_eq!( + // optimization_a_prepare, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_migrate, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + // old_attached_node_id: NodeId(1), + // new_attached_node_id: NodeId(2) + // }) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_cleanup, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); shard_a.intent.clear(&mut scheduler); shard_b.intent.clear(&mut scheduler); @@ -1822,6 +2103,190 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// Complicated case: moving attachment to somewhere better where we do not have a secondary + /// already, creating one as needed. 
+ fn optimize_attachment_multistep() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + /// Check that multi-step migration works when moving to somewhere that is only better by + /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary + /// counting toward the affinity score such that it prevents the rest of the migration from happening. 
+ fn optimize_attachment_marginal() -> anyhow::Result<()> { + let nodes = make_test_nodes(2, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Multi-sharded tenant, we will craft a situation where affinity + // scores differ only slightly + let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None); + + // 1 attached on node 1 + shards[0] + .intent + .set_attached(&mut scheduler, Some(NodeId(1))); + // 3 attached on node 2 + shards[1] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[2] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[3] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + + // The scheduler should figure out that we need to: + // - Create a secondary for shard 3 on node 1 + // - Migrate shard 3 to node 1 + // - Remove shard 3's location on node 2 + + fn make_schedule_context(shards: &Vec) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + for shard in shards { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + schedule_context + } + + let schedule_context = make_schedule_context(&shards); + let optimization_a_prepare = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_migrate = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_cleanup = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // Everything should be stable now + let schedule_context = make_schedule_context(&shards); + assert_eq!( + shards[0].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[1].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[2].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[3].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + + for mut shard in shards { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } + #[test] fn optimize_secondary() -> anyhow::Result<()> { let nodes = make_test_nodes(4, &[]); @@ -1839,9 +2304,7 @@ pub(crate) mod tests { let mut schedule_context = ScheduleContext::default(); schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); @@ -1872,7 
+2335,6 @@ pub(crate) mod tests { // called repeatedly in the background. // Returns the applied optimizations fn optimize_til_idle( - nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], ) -> Vec { @@ -1884,14 +2346,18 @@ pub(crate) mod tests { for shard in shards.iter() { schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } } for shard in shards.iter_mut() { - let optimization = shard.optimize_attachment(nodes, &schedule_context); + let optimization = shard.optimize_attachment(scheduler, &schedule_context); + tracing::info!( + "optimize_attachment({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1899,7 +2365,15 @@ pub(crate) mod tests { } let optimization = shard.optimize_secondary(scheduler, &schedule_context); + tracing::info!( + "optimize_secondary({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1923,14 +2397,34 @@ pub(crate) mod tests { /// that it converges. #[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4, &[]); + let nodes = make_test_nodes( + 9, + &[ + // Initial 6 nodes + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + AvailabilityZone("az-c".to_string()), + // Three we will add later + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); - // Only show the scheduler a couple of nodes + // Only show the scheduler two nodes in each AZ to start with let mut scheduler = Scheduler::new([].iter()); - scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + for i in 1..=6 { + scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap()); + } - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); + let mut shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(4), + Some(AvailabilityZone("az-a".to_string())), + ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1938,30 +2432,50 @@ pub(crate) mod tests { .is_ok()); } - // We should see equal number of locations on the two nodes. - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Initial: attached locations land in the tenant's home AZ. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); - - assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - // Add another two nodes: we should see the shards spread out when their optimize - // methods are called - scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); - optimize_til_idle(&nodes, &mut scheduler, &mut shards); + // Initial: secondary locations in a remote AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + // Add another three nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap()); + optimize_til_idle(&mut scheduler, &mut shards); + + // We expect one attached location was moved to the new node in the tenant's home AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1); + // The original node has one less attached shard + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + // One of the original nodes still has two attachments, since there are an odd number of nodes assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); - - assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + // None of our secondaries moved, since we already had enough nodes for those to be + // scheduled perfectly + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); @@ -2001,10 +2515,10 @@ pub(crate) mod tests { shard.schedule(&mut scheduler, context).unwrap(); } - let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + let applied_to_a = optimize_til_idle(&mut scheduler, &mut a); assert_eq!(applied_to_a, vec![]); - let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + let 
applied_to_b = optimize_til_idle(&mut scheduler, &mut b); assert_eq!(applied_to_b, vec![]); for shard in a.iter_mut().chain(b.iter_mut()) { diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index caa89955e3..76c3ad01a4 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import re +import threading from pathlib import Path import pytest @@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): check_pgbench_output(out_path) - with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + stop_pump = threading.Event() + + def pump_controller(): + # Run a background loop to force the storage controller to run its + # background work faster than it otherwise would: this helps + # us: + # A) to create a test that runs in a shorter time + # B) to create a test that is more intensive by doing the shard migrations + # after splits happen more rapidly. + while not stop_pump.is_set(): + env.storage_controller.reconcile_all() + stop_pump.wait(0.1) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads: pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) @@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + pump_fut = pgbench_threads.submit(pump_controller) + pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) @@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + stop_pump.set() + pump_fut.result() + def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 49f41483ec..d45db28c78 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -13,11 +13,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: @@ -85,8 +87,12 @@ def test_storage_controller_many_tenants( ) AZS = ["alpha", "bravo", "charlie"] + + def az_selector(node_id): + return f"az-{AZS[(node_id - 1) % len(AZS)]}" + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( - {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + {"availability_zone": az_selector(ps_cfg["id"])} ) # A small sleep on each call into the notify hook, to simulate the latency of doing a database write @@ -168,6 +174,31 @@ def test_storage_controller_many_tenants( log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards + def assert_all_tenants_scheduled_in_home_az(): + for tenant_id in tenant_ids: + desc = 
env.storage_controller.tenant_describe(tenant_id) + preferred_az = None + for shard in desc["shards"]: + # All shards in a tenant should have the same preferred AZ + if preferred_az is None: + preferred_az = shard["preferred_az_id"] + else: + assert preferred_az == shard["preferred_az_id"] + + # Attachment should be in the preferred AZ + assert shard["preferred_az_id"] == az_selector( + shard["node_attached"] + ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + + # Secondary locations should not be in the preferred AZ + for node_secondary in shard["node_secondary"]: + assert ( + shard["preferred_az_id"] != az_selector(node_secondary) + ), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + + # There should only be one secondary location (i.e. no migrations in flight) + assert len(shard["node_secondary"]) == 1 + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 @@ -242,6 +273,22 @@ def test_storage_controller_many_tenants( f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" ) + # Check initial scheduling + assert_all_tenants_scheduled_in_home_az() + az_attached_counts: defaultdict[str, int] = defaultdict(int) + az_secondary_counts: defaultdict[str, int] = defaultdict(int) + node_attached_counts: defaultdict[str, int] = defaultdict(int) + for tenant_id in tenants.keys(): + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + az_attached_counts[az_selector(shard["node_attached"])] += 1 + node_attached_counts[shard["node_attached"]] += 1 + for node_secondary in shard["node_secondary"]: + az_secondary_counts[az_selector(node_secondary)] += 1 + + log.info(f"Initial node attached counts: {node_attached_counts}") + log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}") + # Plan operations: ensure each tenant with a timeline gets at least # one of each operation type. Then add other tenants to make up the # numbers. @@ -450,11 +497,77 @@ def test_storage_controller_many_tenants( env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() + # Since we did `reconcile_until_idle` during the above loop, the system should be left in + # an optimally scheduled state. Validate that this includes all the tenants being scheduled + # in their home AZ. + assert_all_tenants_scheduled_in_home_az() + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
env.storage_controller.consistency_check() check_memory() + # Simulate loss of an AZ + victim_az = "az-alpha" + killed_pageservers = [] + for ps in env.pageservers: + if az_selector(ps.id) == victim_az: + ps.stop(immediate=True) + killed_pageservers.append(ps) + log.info(f"Killed pageserver {ps.id}") + + assert killed_pageservers + + # Wait for the controller to notice the pageservers are dead + def assert_pageservers_availability( + pageservers: list[NeonPageserver], expected_availability: PageserverAvailability + ): + nodes = env.storage_controller.nodes() + checked_any = False + node_ids = [ps.id for ps in pageservers] + for node in nodes: + if node["id"] in node_ids: + checked_any = True + assert ( + node["availability"] == expected_availability + ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + + assert checked_any + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE), + timeout=60, + ) + + # Let the controller finish all its rescheduling + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Check that all the tenants are rescheduled to the remaining pageservers + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + # Attachment should be outside the AZ where we killed the pageservers + assert ( + az_selector(shard["node_attached"]) != victim_az + ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + + # Bring back the pageservers + for ps in killed_pageservers: + ps.start() + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE), + timeout=60, + ) + + # A very long timeout is required: we will be migrating all the tenants on all the pageservers + # in the region that we just restored. Assume it'll take up to twice as long as it took to fill + # a single node + env.storage_controller.reconcile_until_idle( + max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4 + ) + assert_all_tenants_scheduled_in_home_az() + # Stop the storage controller before tearing down fixtures, because it otherwise might log # errors trying to call our `ComputeReconfigure`. env.storage_controller.stop() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 673904a1cd..86a6b7428b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -520,14 +520,18 @@ def test_sharding_split_smoke( shard_count = 2 # Shard count we split into split_shard_count = 4 - # We will have 2 shards per pageserver once done (including secondaries) - neon_env_builder.num_pageservers = split_shard_count + # In preferred AZ & other AZ we will end up with one shard per pageserver + neon_env_builder.num_pageservers = split_shard_count * 2 # Two AZs def assign_az(ps_cfg): az = f"az-{(ps_cfg['id'] - 1) % 2}" ps_cfg["availability_zone"] = az + # We will run more pageservers than tests usually do, so give them tiny page caches + # in case we're on a test node under memory pressure. 
+ ps_cfg["page_cache_size"] = 128 + neon_env_builder.pageserver_config_override = assign_az # 1MiB stripes: enable getting some meaningful data distribution without @@ -679,8 +683,8 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) - expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + # - split_shard_count/2 reconciles to migrate shards to their temporary secondaries + expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2) reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -745,10 +749,14 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 2, - 2: 2, - 3: 2, - 4: 2, + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, } # The controller is not required to lay out the attached locations in any particular way, but @@ -1387,13 +1395,7 @@ def test_sharding_split_failures( else: attached_count += 1 - if exclude_ps_id is not None: - # For a node failure case, we expect there to be a secondary location - # scheduled on the offline node, so expect one fewer secondary in total - assert secondary_count == initial_shard_count - 1 - else: - assert secondary_count == initial_shard_count - + assert secondary_count == initial_shard_count assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id: int | None = None) -> None: diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3a55e75589..8ffb6ba6b2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3213,11 +3213,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): def assign_az(ps_cfg): - az = f"az-{ps_cfg['id']}" + az = f"az-{ps_cfg['id'] % 2}" + log.info("Assigned AZ {az}") ps_cfg["availability_zone"] = az neon_env_builder.pageserver_config_override = assign_az - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 4 env = neon_env_builder.init_configs() env.start() @@ -3232,8 +3233,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): assert shards[0]["preferred_az_id"] == expected_az + # When all other schedule scoring parameters are equal, tenants should round-robin on AZs + assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0" + assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1" + assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0" + + # Try modifying preferred AZ updated = env.storage_controller.set_preferred_azs( - {TenantShardId(tid, 0, 0): "foo" for tid in tids} + {TenantShardId(tid, 0, 0): "az-0" for tid in tids} ) assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) @@ -3241,29 +3248,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for tid in tids: 
shards = env.storage_controller.tenant_describe(tid)["shards"] assert len(shards) == 1 - assert shards[0]["preferred_az_id"] == "foo" + assert shards[0]["preferred_az_id"] == "az-0" - # Generate a layer to avoid shard split handling on ps from tripping - # up on debug assert. - timeline_id = TimelineId.generate() - env.create_timeline("bar", tids[0], timeline_id) - - workload = Workload(env, tids[0], timeline_id, branch_name="bar") - workload.init() - workload.write_rows(256) - workload.validate() + # Having modified preferred AZ, we should get moved there + env.storage_controller.reconcile_until_idle(max_interval=0.1) + for tid in tids: + shard = env.storage_controller.tenant_describe(tid)["shards"][0] + attached_to = shard["node_attached"] + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + env.storage_controller.reconcile_until_idle(max_interval=0.1) shards = env.storage_controller.tenant_describe(tids[0])["shards"] assert len(shards) == 2 for shard in shards: attached_to = shard["node_attached"] - expected_az = env.get_pageserver(attached_to).az_id - - # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed - # in putting the tenant shards in the preferred AZ. - # To be fixed in https://github.com/neondatabase/neon/pull/9916 - # assert shard["preferred_az_id"] == expected_az + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" @run_only_on_default_postgres("Postgres version makes no difference here") From 1783501eaa12519cd94c174114d2e79381ffee2d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 13 Jan 2025 22:01:03 +0200 Subject: [PATCH 22/32] Increase max connection for replica to prevent test flukyness (#10306) ## Problem See https://github.com/neondatabase/neon/issues/10167 Too small number of `max_connections` (2) can cause failures of test_physical_replication_config_mismatch_too_many_known_xids test ## Summary of changes Increase `max_connections` to 5 Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_physical_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 6cb11b825d..17819fd367 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en origin=primary, endpoint_id="secondary", config_lines=[ - "max_connections=2", + "max_connections=5", "autovacuum_max_workers=1", "max_worker_processes=5", "max_wal_senders=1", From 430b556b3472007341f57af57ebdbafb67e6dc85 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 13 Jan 2025 18:44:39 -0600 Subject: [PATCH 23/32] Update postgres-exporter and sql_exporter in computes (#10349) The postgres-exporter was much further out of date, but let's just bump both. 
Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 4 ++-- test_runner/regress/test_compute_metrics.py | 9 +++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index cf6439d004..7a2ec9c43e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.16.0 +ENV SQL_EXPORTER_VERSION=0.17.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2d38796d77..89cee6761f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1274,11 +1274,11 @@ RUN set -e \ # ######################################################################################### -FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter +FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter +FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter ######################################################################################### # diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 71963355b7..5dcc93acff 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -219,7 +219,7 @@ if SQL_EXPORTER is None: # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port @@ -252,7 +252,7 @@ if SQL_EXPORTER is None: log.info("Waiting for sql_exporter to be ready") wait_for_logs( self, - rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + rf'msg="Listening on" address=\[::\]:{self.__port}', timeout=5, ) @@ -344,10 +344,7 @@ else: time.sleep(0.5) continue - if ( - f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' - in line - ): + if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line: break @override From a039f8381f83547dec0d774b3598b8e4b75abf3a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 14 Jan 2025 07:54:30 +0200 Subject: [PATCH 24/32] Optimize vector get last written LSN (#10360) ## Problem See https://github.com/neondatabase/neon/issues/10281 pg17 performs extra lock/unlock operation when fetching LwLSN. ## Summary of changes Perform all lookups under one lock, moving initialization of not found keys to separate loop. 
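The pageserver-side change here is only a vendored Postgres submodule bump; the actual fix lives in the Postgres PR referenced below. As a rough sketch of the pattern described above (illustrative only, with made-up names; this is not the Postgres code): take the lock once for the whole batch of lookups, remember which keys were missing, and initialize those in a separate loop instead of locking and unlocking per key.

```rust
use std::collections::HashMap;
use std::sync::Mutex;

// Hypothetical cache of last-written LSNs, keyed by block number.
struct LwLsnCache {
    entries: Mutex<HashMap<u64, u64>>,
}

impl LwLsnCache {
    // One lock acquisition covers all lookups; keys that are not present are
    // collected and initialized afterwards rather than re-taking the lock per key.
    fn get_last_written_lsns(&self, keys: &[u64], default_lsn: u64) -> Vec<u64> {
        let mut misses = Vec::new();
        let mut result = Vec::with_capacity(keys.len());
        {
            let entries = self.entries.lock().unwrap();
            for &key in keys {
                match entries.get(&key) {
                    Some(&lsn) => result.push(lsn),
                    None => {
                        misses.push(key);
                        result.push(default_lsn);
                    }
                }
            }
        } // lock released here
        if !misses.is_empty() {
            // Separate initialization pass for the keys that were not found.
            let mut entries = self.entries.lock().unwrap();
            for key in misses {
                entries.entry(key).or_insert(default_lsn);
            }
        }
        result
    }
}

fn main() {
    let cache = LwLsnCache {
        entries: Mutex::new(HashMap::new()),
    };
    let lsns = cache.get_last_written_lsns(&[1, 2, 3], 0x1_0000);
    assert_eq!(lsns, vec![0x1_0000; 3]);
}
```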
Related Postgres PR: https://github.com/neondatabase/postgres/pull/553 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9c9e9a78a9..0f8da73ed0 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 9c9e9a78a93aebec2f6a2f54644442d35ffa245c +Subproject commit 0f8da73ed08d4fc4ee58cccea008c75bfb20baa8 diff --git a/vendor/revisions.json b/vendor/revisions.json index d182b88008..b4d57ab709 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "9c9e9a78a93aebec2f6a2f54644442d35ffa245c" + "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8" ], "v16": [ "16.6", From df4abd8b14ff823f060f66acde34713da013d062 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 14 Jan 2025 12:53:32 +0000 Subject: [PATCH 25/32] fix: force-refresh azure identity token (#10378) ## Problem Because of https://github.com/Azure/azure-sdk-for-rust/issues/1739, our identity token file was not being refreshed. This caused our uploads to start failing when the storage token expired. ## Summary of changes Drop and recreate the remote storage config every time we upload in order to force reload the identity token file. --- proxy/src/context/parquet.rs | 73 +++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 5f65b17374..d7ffff0483 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,10 +187,6 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = GenericRemoteStorage::from_config(&remote_storage_config) - .await - .context("remote storage init")?; - let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -224,18 +220,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); - let storage_disconnect = - GenericRemoteStorage::from_config(&disconnect_events_storage_config) - .await - .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(remote_storage_config, rx, parquet_config), + worker_inner( + disconnect_events_storage_config, + rx_disconnect, + parquet_config_disconnect + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(remote_storage_config, rx, parquet_config).await } } @@ -251,18 +247,32 @@ struct ParquetConfig { test_remote_failures: u64, } +impl ParquetConfig { + async fn storage( + &self, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + let storage = GenericRemoteStorage::from_config(storage_config) + .await + .context("remote storage init")?; + + #[cfg(any(test, feature = "testing"))] + if self.test_remote_failures > 0 { + return Ok(GenericRemoteStorage::unreliable_wrapper( + storage, + self.test_remote_failures, + )); + } + + Ok(storage) + } +} + async fn worker_inner( - storage: GenericRemoteStorage, + storage_config: RemoteStorageConfig, rx: impl Stream, config: ParquetConfig, ) 
-> anyhow::Result<()> { - #[cfg(any(test, feature = "testing"))] - let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) - } else { - storage - }; - let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -285,7 +295,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage).await?; + let file = upload_parquet(w, len, &storage_config, &config).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -298,7 +308,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; } Ok(()) @@ -340,7 +350,8 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage: &GenericRemoteStorage, + storage_config: &RemoteStorageConfig, + config: &ParquetConfig, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -377,6 +388,15 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); + // A bug in azure-sdk means that the identity-token-file that expires after + // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage + // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh + // the storage token, but the identity token has now expired. + // + // + // To work around this, we recreate the storage every time. + let storage = config.storage(storage_config).await?; + let year = now.year(); let month = now.month(); let day = now.day(); @@ -431,8 +451,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -559,12 +579,11 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config) + + worker_inner(remote_storage_config, rx, config) .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); - let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) From 9bdb14c1c0b3ef1082ce68f1c54d4547393da362 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:27:48 -0500 Subject: [PATCH 26/32] fix(pageserver): ensure initial image layers have correct key ranges (#10374) ## Problem Discovered during the relation dir refactor work. If we do not create images as in this patch, we would get two set of image layers: ``` 0000...METADATA_KEYS 0000...REL_KEYS ``` They overlap at the same LSN and would cause data loss for relation keys. This doesn't happen in prod because initial image layer generation is never called, but better to be fixed to avoid future issues with the reldir refactors. ## Summary of changes * Consolidate create_image_layers call into a single one. 
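To make the failure mode concrete: image layers generated at a single LSN must cover disjoint key ranges, and issuing two independent `create_image_layers` calls over partitions that overlap at the same LSN would break that. A minimal sketch of the disjointness check, using plain integer keys rather than the pageserver's actual key type (illustrative only, not code from this patch):

```rust
use std::ops::Range;

// Returns true if no two key ranges overlap. Image layers produced at a single
// LSN must satisfy this; two independent image layer creation passes over
// overlapping partitions would violate it.
fn ranges_disjoint(mut ranges: Vec<Range<u64>>) -> bool {
    ranges.sort_by_key(|r| r.start);
    ranges.windows(2).all(|w| w[0].end <= w[1].start)
}

fn main() {
    // One combined pass over both partitions: disjoint ranges, invariant holds.
    assert!(ranges_disjoint(vec![0..10, 10..20, 40..50]));
    // Two separate passes covering overlapping key space at the same LSN: invariant broken.
    assert!(!ranges_disjoint(vec![0..20, 10..30]));
}
```

Consolidating both partitions into a single call, as the diff below does, keeps the generated image layers within one disjoint partitioning.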
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 43 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f7227efeba..741b214a73 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3781,36 +3781,35 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( - &rel_partition, - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace. + // So that the key ranges don't overlap. + let mut partitions = KeyPartitioning::default(); + partitions.parts.extend(rel_partition.parts); if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, "currently sparse keyspace should only contain a single metadata keyspace" ); - layers_to_upload.extend( - self.create_image_layers( - // Safety: create_image_layers treat sparse keyspaces differently that it does not scan - // every single key within the keyspace, and therefore, it's safe to force converting it - // into a dense keyspace before calling this function. - &metadata_partition.into_dense(), - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. + partitions + .parts + .extend(metadata_partition.into_dense().parts); } + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &partitions, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. From 2466a2f97763aff5b84253a242909fc19bc995a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 14 Jan 2025 16:28:01 +0100 Subject: [PATCH 27/32] page_service: throttle individual requests instead of the batched request (#10353) ## Problem Before this PR, the pagestream throttle was applied weighted on a per-batch basis. This had several problems: 1. The throttle occurence counters were only bumped by `1` instead of `batch_size`. 2. The throttle wait time aggregator metric only counted one wait time, irrespective of `batch_size`. That makes sense in some ways of looking at it but not in others. 3. If the last request in the batch runs into the throttle, the other requests in the batch are also throttled, i.e., over-throttling happens (theoretical, didn't measure it in practice). ## Solution It occured to me that we can simply push the throttling upwards into `pagestream_read_message`. This has the added benefit that in pipeline mode, the `executor` stage will, if it is idle, steal whatever requests already made it into the `spsc_fold` and execute them; before this change, that was not the case - the throttling happened in the `executor` stage instead of the `batcher` stage. ## Code Changes There are two changes in this PR: 1. Lifting up the throttling into the `pagestream_read_message` method. 2. Move the throttling metrics out of the `Throttle` type into `SmgrOpMetrics`. Unlike the other smgr metrics, throttling is per-tenant, hence the Arc. 3. 
Refactor the `SmgrOpTimer` implementation to account for the new observation states, and simplify its design. 4. Drive-by-fix flush time metrics. It was using the same `now` in the `observe_guard` every time. The `SmgrOpTimer` is now a state machine. Each observation point moves the state machine forward. If a timer object is dropped early some "pair"-like metrics still require an increment or observation. That's done in the Drop implementation, by driving the state machine to completion. --- pageserver/src/metrics.rs | 367 +++++++++++------------ pageserver/src/page_service.rs | 139 +++++---- pageserver/src/tenant.rs | 14 +- pageserver/src/tenant/throttle.rs | 45 +-- pageserver/src/tenant/timeline.rs | 8 +- pageserver/src/tenant/timeline/delete.rs | 1 + 6 files changed, 284 insertions(+), 290 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b8419fda9..5b1cbbad63 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1224,117 +1224,189 @@ pub(crate) struct SmgrOpTimerInner { global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + throttling: Arc, + timings: SmgrOpTimerState, } +/// The stages of request processing are represented by the enum variants. +/// Used as part of [`SmgrOpTimerInner::timings`]. +/// +/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the +/// transition points. +/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`] +/// to the next state. +/// +/// Each request goes through every stage, in all configurations. +/// #[derive(Debug)] enum SmgrOpTimerState { Received { + // In the future, we may want to track the full time the request spent + // inside pageserver process (time spent in kernel buffers can't be tracked). + // `received_at` would be used for that. + #[allow(dead_code)] received_at: Instant, }, - ThrottleDoneExecutionStarting { - received_at: Instant, + Throttling { throttle_started_at: Instant, - started_execution_at: Instant, }, + Batching { + throttle_done_at: Instant, + }, + Executing { + execution_started_at: Instant, + }, + Flushing, + // NB: when adding observation points, remember to update the Drop impl. } +// NB: when adding observation points, remember to update the Drop impl. +impl SmgrOpTimer { + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_throttle_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else { + return; + }; + inner.throttling.count_accounted_start.inc(); + inner.timings = SmgrOpTimerState::Throttling { + throttle_started_at: at, + }; + } + + /// See [`SmgrOpTimerState`] for more context. 
+    pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
+        let Some(inner) = self.0.as_mut() else {
+            return;
+        };
+        let SmgrOpTimerState::Throttling {
+            throttle_started_at,
+        } = &inner.timings
+        else {
+            return;
+        };
+        inner.throttling.count_accounted_finish.inc();
+        match throttle {
+            ThrottleResult::NotThrottled { end } => {
+                inner.timings = SmgrOpTimerState::Batching {
+                    throttle_done_at: end,
+                };
+            }
+            ThrottleResult::Throttled { end } => {
+                // update metrics
+                inner.throttling.count_throttled.inc();
+                inner
+                    .throttling
+                    .wait_time
+                    .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
+                // state transition
+                inner.timings = SmgrOpTimerState::Batching {
+                    throttle_done_at: end,
+                };
+            }
+        }
+    }
+
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_execution_start(&mut self, at: Instant) {
+        let Some(inner) = self.0.as_mut() else {
+            return;
+        };
+        let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
+            return;
+        };
+        // update metrics
+        let batch = at - *throttle_done_at;
+        inner.global_batch_wait_time.observe(batch.as_secs_f64());
+        inner
+            .per_timeline_batch_wait_time
+            .observe(batch.as_secs_f64());
+        // state transition
+        inner.timings = SmgrOpTimerState::Executing {
+            execution_started_at: at,
+        }
+    }
+
+    /// For all but the first caller, this is a no-op.
+    /// The first caller receives Some; subsequent callers receive None.
+    ///
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_execution_end_flush_start(
+        &mut self,
+        at: Instant,
+    ) -> Option<SmgrOpFlushInProgress> {
+        // NB: unlike the other observe_* methods, this one take()s.
+        #[allow(clippy::question_mark)] // maintain similar code pattern.
+        let Some(mut inner) = self.0.take() else {
+            return None;
+        };
+        let SmgrOpTimerState::Executing {
+            execution_started_at,
+        } = &inner.timings
+        else {
+            return None;
+        };
+        // update metrics
+        let execution = at - *execution_started_at;
+        inner
+            .global_execution_latency_histo
+            .observe(execution.as_secs_f64());
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
+        }
+
+        // state transition
+        inner.timings = SmgrOpTimerState::Flushing;
+
+        // return the flush in progress object which
+        // will do the remaining metrics updates
+        let SmgrOpTimerInner {
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            ..
+        } = inner;
+        Some(SmgrOpFlushInProgress {
+            flush_started_at: at,
+            global_micros: global_flush_in_progress_micros,
+            per_timeline_micros: per_timeline_flush_in_progress_micros,
+        })
+    }
+}
+
+/// The last stage of request processing is serializing and flushing the request
+/// into the TCP connection. We want to make slow flushes observable
+/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
+/// to periodically bump the metric.
+///
+/// If in the future we decide that we're not interested in live updates, we can
+/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
+/// and remove this struct from the code base.
pub(crate) struct SmgrOpFlushInProgress { flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } -impl SmgrOpTimer { - pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) { - let inner = self.0.as_mut().expect("other public methods consume self"); - match (&mut inner.timings, throttle) { - (SmgrOpTimerState::Received { received_at }, throttle) => match throttle { - ThrottleResult::NotThrottled { start } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *received_at, - throttle_started_at: *start, - started_execution_at: *start, - }; - } - ThrottleResult::Throttled { start, end } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *start, - throttle_started_at: *start, - started_execution_at: *end, - }; - } - }, - (x, _) => panic!("called in unexpected state: {x:?}"), - } - } - - pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { - let (flush_start, inner) = self - .smgr_op_end() - .expect("this method consume self, and the only other caller is drop handler"); - let SmgrOpTimerInner { - global_flush_in_progress_micros, - per_timeline_flush_in_progress_micros, - .. - } = inner; - SmgrOpFlushInProgress { - flush_started_at: flush_start, - global_micros: global_flush_in_progress_micros, - per_timeline_micros: per_timeline_flush_in_progress_micros, - } - } - - /// Returns `None`` if this method has already been called, `Some` otherwise. - fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { - let inner = self.0.take()?; - - let now = Instant::now(); - - let batch; - let execution; - let throttle; - match inner.timings { - SmgrOpTimerState::Received { received_at } => { - batch = (now - received_at).as_secs_f64(); - // TODO: use label for dropped requests. - // This is quite rare in practice, only during tenant/pageservers shutdown. - throttle = Duration::ZERO; - execution = Duration::ZERO.as_secs_f64(); - } - SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at, - throttle_started_at, - started_execution_at, - } => { - batch = (throttle_started_at - received_at).as_secs_f64(); - throttle = started_execution_at - throttle_started_at; - execution = (now - started_execution_at).as_secs_f64(); - } - } - - // update time spent in batching - inner.global_batch_wait_time.observe(batch); - inner.per_timeline_batch_wait_time.observe(batch); - - // time spent in throttle metric is updated by throttle impl - let _ = throttle; - - // update metrics for execution latency - inner.global_execution_latency_histo.observe(execution); - if let Some(per_timeline_execution_latency_histo) = - &inner.per_timeline_execution_latency_histo - { - per_timeline_execution_latency_histo.observe(execution); - } - - Some((now, inner)) - } -} - impl Drop for SmgrOpTimer { fn drop(&mut self) { - self.smgr_op_end(); + // In case of early drop, update any of the remaining metrics with + // observations so that (started,finished) counter pairs balance out + // and all counters on the latency path have the the same number of + // observations. + // It's technically lying and it would be better if each metric had + // a separate label or similar for cancelled requests. + // But we don't have that right now and counter pairs balancing + // out is useful when using the metrics in panels and whatnot. 
+ let now = Instant::now(); + self.observe_throttle_start(now); + self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); + self.observe_execution_start(now); + self.observe_execution_end_flush_start(now); } } @@ -1345,12 +1417,12 @@ impl SmgrOpFlushInProgress { { let mut fut = std::pin::pin!(fut); - let now = Instant::now(); // Whenever observe_guard gets called, or dropped, // it adds the time elapsed since its last call to metrics. // Last call is tracked in `now`. let mut observe_guard = scopeguard::guard( || { + let now = Instant::now(); let elapsed = now - self.flush_started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); @@ -1393,7 +1465,6 @@ pub enum SmgrQueryType { GetSlruSegment, } -#[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1405,6 +1476,7 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros: IntCounter, global_batch_wait_time: Histogram, per_timeline_batch_wait_time: Histogram, + throttling: Arc, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1610,7 +1682,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| }); impl SmgrQueryTimePerTimeline { - pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + pub(crate) fn new( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + pagestream_throttle_metrics: Arc, + ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); @@ -1671,6 +1747,7 @@ impl SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, + throttling: pagestream_throttle_metrics, } } pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { @@ -1686,88 +1763,24 @@ impl SmgrQueryTimePerTimeline { SmgrOpTimer(Some(SmgrOpTimerInner { global_execution_latency_histo: self.global_latency[op as usize].clone(), per_timeline_execution_latency_histo: per_timeline_latency_histo, - timings: SmgrOpTimerState::Received { received_at }, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), global_batch_wait_time: self.global_batch_wait_time.clone(), per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), + throttling: self.throttling.clone(), + timings: SmgrOpTimerState::Received { received_at }, })) } + /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); } } -#[cfg(test)] -mod smgr_query_time_tests { - use std::time::Instant; - - use pageserver_api::shard::TenantShardId; - use strum::IntoEnumIterator; - use utils::id::{TenantId, TimelineId}; - - // Regression test, we used hard-coded string constants before using an enum. 
- #[test] - fn op_label_name() { - use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 5] = [ - (GetRelExists, "get_rel_exists"), - (GetRelSize, "get_rel_size"), - (GetPageAtLsn, "get_page_at_lsn"), - (GetDbSize, "get_db_size"), - (GetSlruSegment, "get_slru_segment"), - ]; - for (op, expect) in expect { - let actual: &'static str = op.into(); - assert_eq!(actual, expect); - } - } - - #[test] - fn basic() { - let ops: Vec<_> = super::SmgrQueryType::iter().collect(); - - for op in &ops { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let metrics = super::SmgrQueryTimePerTimeline::new( - &TenantShardId::unsharded(tenant_id), - &timeline_id, - ); - - let get_counts = || { - let global: u64 = ops - .iter() - .map(|op| metrics.global_latency[*op as usize].get_sample_count()) - .sum(); - ( - global, - metrics.per_timeline_getpage_latency.get_sample_count(), - ) - }; - - let (pre_global, pre_per_tenant_timeline) = get_counts(); - assert_eq!(pre_per_tenant_timeline, 0); - - let timer = metrics.start_smgr_op(*op, Instant::now()); - drop(timer); - - let (post_global, post_per_tenant_timeline) = get_counts(); - if matches!(op, super::SmgrQueryType::GetPageAtLsn) { - // getpage ops are tracked per-timeline, others aren't - assert_eq!(post_per_tenant_timeline, 1); - } else { - assert_eq!(post_per_tenant_timeline, 0); - } - assert!(post_global > pre_global); - } - } -} - // keep in sync with control plane Go code so that we can validate // compute's basebackup_ms metric with our perspective in the context of SLI/SLO. static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { @@ -3563,9 +3576,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self}; - - struct GlobalAndPerTenantIntCounter { + pub(crate) struct GlobalAndPerTenantIntCounter { global: IntCounter, per_tenant: IntCounter, } @@ -3583,10 +3594,10 @@ pub(crate) mod tenant_throttling { } pub(crate) struct Metrics { - count_accounted_start: GlobalAndPerTenantIntCounter, - count_accounted_finish: GlobalAndPerTenantIntCounter, - wait_time: GlobalAndPerTenantIntCounter, - count_throttled: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter, + pub(super) wait_time: GlobalAndPerTenantIntCounter, + pub(super) count_throttled: GlobalAndPerTenantIntCounter, } static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { @@ -3721,26 +3732,6 @@ pub(crate) mod tenant_throttling { } } } - - impl tenant::throttle::Metric for Metrics { - #[inline(always)] - fn accounting_start(&self) { - self.count_accounted_start.inc(); - } - #[inline(always)] - fn accounting_finish(&self) { - self.count_accounted_finish.inc(); - } - #[inline(always)] - fn observe_throttling( - &self, - tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, - ) { - let val = u64::try_from(wait_time.as_micros()).unwrap(); - self.wait_time.inc_by(val); - self.count_throttled.inc(); - } - } } pub(crate) mod disk_usage_based_eviction { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6504bd3b5..b3e18fed99 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -592,43 +592,21 @@ enum BatchedFeMessage { } impl BatchedFeMessage { - async fn throttle_and_record_start_processing( - &mut self, - cancel: &CancellationToken, - ) -> Result<(), QueryError> { - let (shard, tokens, 
timers) = match self { - BatchedFeMessage::Exists { shard, timer, .. } - | BatchedFeMessage::Nblocks { shard, timer, .. } - | BatchedFeMessage::DbSize { shard, timer, .. } - | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { - ( - shard, - // 1 token is probably under-estimating because these - // request handlers typically do several Timeline::get calls. - 1, - itertools::Either::Left(std::iter::once(timer)), - ) + fn observe_execution_start(&mut self, at: Instant) { + match self { + BatchedFeMessage::Exists { timer, .. } + | BatchedFeMessage::Nblocks { timer, .. } + | BatchedFeMessage::DbSize { timer, .. } + | BatchedFeMessage::GetSlruSegment { timer, .. } => { + timer.observe_execution_start(at); } - BatchedFeMessage::GetPage { shard, pages, .. } => ( - shard, - pages.len(), - itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)), - ), - BatchedFeMessage::RespondError { .. } => return Ok(()), - }; - let throttled = tokio::select! { - throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } - _ = shard.cancel.cancelled() => { - return Err(QueryError::Shutdown); + BatchedFeMessage::GetPage { pages, .. } => { + for page in pages { + page.timer.observe_execution_start(at); + } } - _ = cancel.cancelled() => { - return Err(QueryError::Shutdown); - } - }; - for timer in timers { - timer.observe_throttle_done_execution_starting(&throttled); + BatchedFeMessage::RespondError { .. } => {} } - Ok(()) } } @@ -720,6 +698,26 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + // TODO: turn in to async closure once available to avoid repeating received_at + async fn record_op_start_and_throttle( + shard: &timeline::handle::Handle, + op: metrics::SmgrQueryType, + received_at: Instant, + ) -> Result { + // It's important to start the smgr op metric recorder as early as possible + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for throttle, batching, or actual request handling. + let mut timer = shard.query_metrics.start_smgr_op(op, received_at); + let now = Instant::now(); + timer.observe_throttle_start(now); + let throttled = tokio::select! 
{ + res = shard.pagestream_throttle.throttle(1, now) => res, + _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown), + }; + timer.observe_throttle_done(throttled); + Ok(timer) + } + let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); @@ -727,9 +725,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelExists, + received_at, + ) + .await?; BatchedFeMessage::Exists { span, timer, @@ -743,9 +744,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelSize, + received_at, + ) + .await?; BatchedFeMessage::Nblocks { span, timer, @@ -759,9 +763,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetDbSize, + received_at, + ) + .await?; BatchedFeMessage::DbSize { span, timer, @@ -775,9 +782,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetSlruSegment, + received_at, + ) + .await?; BatchedFeMessage::GetSlruSegment { span, timer, @@ -826,12 +836,12 @@ impl PageServerHandler { } }; - // It's important to start the timer before waiting for the LSN - // so that the _started counters are incremented before we do - // any serious waiting, e.g., for LSNs. - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetPageAtLsn, + received_at, + ) + .await?; let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, @@ -937,6 +947,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + let started_at = Instant::now(); + let batch = { + let mut batch = batch; + batch.observe_execution_start(started_at); + batch + }; + // invoke handler function let (handler_results, span): ( Vec>, @@ -1103,8 +1120,11 @@ impl PageServerHandler { // The timer's underlying metric is used for a storage-internal latency SLO and // we don't want to include latency in it that we can't control. // And as pointed out above, in this case, we don't control the time that flush will take. 
- let flushing_timer = - timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + let flushing_timer = timer.map(|mut timer| { + timer + .observe_execution_end_flush_start(Instant::now()) + .expect("we are the first caller") + }); // what we want to do let flush_fut = pgb_writer.flush(); @@ -1258,7 +1278,7 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let mut msg = match msg { + let msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); @@ -1266,10 +1286,6 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await { - break cancelled; - } - let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) .await; @@ -1429,15 +1445,12 @@ impl PageServerHandler { return Ok(()); } }; - let mut batch = match batch { + let batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; - batch - .throttle_and_record_start_processing(&self.cancel) - .await?; self.pagesteam_handle_batched_message( pgb_writer, batch, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 070593b104..f6d758ad22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -365,8 +365,9 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, + + pub(crate) pagestream_throttle_metrics: Arc, /// An ongoing timeline detach concurrency limiter. /// @@ -1687,6 +1688,7 @@ impl Tenant { TimelineResources { remote_client, pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3992,6 +3994,9 @@ impl Tenant { Ok(timeline) } + /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object + /// to ensure proper cleanup of background tasks and metrics. + // // Allow too_many_arguments because a constructor's argument list naturally grows with the // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. 
#[allow(clippy::too_many_arguments)] @@ -4100,8 +4105,10 @@ impl Tenant { gate: Gate::default(), pagestream_throttle: Arc::new(throttle::Throttle::new( Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), + pagestream_throttle_metrics: Arc::new( + crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id), + ), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), @@ -5008,6 +5015,7 @@ impl Tenant { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 8ab6a0e060..300d779125 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -3,7 +3,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::{Duration, Instant}, + time::Instant, }; use arc_swap::ArcSwap; @@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; /// To share a throttle among multiple entities, wrap it in an [`Arc`]. /// /// The intial use case for this is tenant-wide throttling of getpage@lsn requests. -pub struct Throttle { +pub struct Throttle { inner: ArcSwap, - metric: M, /// will be turned into [`Stats::count_accounted_start`] count_accounted_start: AtomicU64, /// will be turned into [`Stats::count_accounted_finish`] @@ -36,15 +35,6 @@ pub struct Inner { pub type Config = pageserver_api::models::ThrottleConfig; -pub struct Observation { - pub wait_time: Duration, -} -pub trait Metric { - fn accounting_start(&self); - fn accounting_finish(&self); - fn observe_throttling(&self, observation: &Observation); -} - /// See [`Throttle::reset_stats`]. pub struct Stats { /// Number of requests that started [`Throttle::throttle`] calls. @@ -59,18 +49,14 @@ pub struct Stats { } pub enum ThrottleResult { - NotThrottled { start: Instant }, - Throttled { start: Instant, end: Instant }, + NotThrottled { end: Instant }, + Throttled { end: Instant }, } -impl Throttle -where - M: Metric, -{ - pub fn new(config: Config, metric: M) -> Self { +impl Throttle { + pub fn new(config: Config) -> Self { Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), - metric, count_accounted_start: AtomicU64::new(0), count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), @@ -127,32 +113,27 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> ThrottleResult { + /// `start` must be [`Instant::now`] or earlier. 
+ pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc - let start = std::time::Instant::now(); - if !inner.enabled { - return ThrottleResult::NotThrottled { start }; + return ThrottleResult::NotThrottled { end: start }; } - self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); - self.metric.accounting_finish(); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); - let now = Instant::now(); - let wait_time = now - start; + let end = Instant::now(); + let wait_time = end - start; self.sum_throttled_usecs .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); - let observation = Observation { wait_time }; - self.metric.observe_throttling(&observation); - ThrottleResult::Throttled { start, end: now } + ThrottleResult::Throttled { end } } else { - ThrottleResult::NotThrottled { start } + ThrottleResult::NotThrottled { end: start } } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 741b214a73..4aa6b7a05a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub pagestream_throttle: - Arc>, + pub pagestream_throttle: Arc, + pub pagestream_throttle_metrics: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -412,8 +412,7 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -2310,6 +2309,7 @@ impl Timeline { query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, + resources.pagestream_throttle_metrics, ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ae44af3fad..bdc315d985 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -301,6 +301,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, pagestream_throttle: tenant.pagestream_throttle.clone(), + pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. From aa7323a384dd60b8c176ba6bed6a3c2e5f9bd95e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 Jan 2025 15:30:43 +0000 Subject: [PATCH 28/32] storage controller: quality of life improvements for AZ handling (#10379) ## Problem Since https://github.com/neondatabase/neon/pull/9916, the preferred AZ of a tenant is much more impactful, and we would like to make it more visible in tooling. 
## Summary of changes - Include AZ in node describe API - Include AZ info in node & tenant outputs in CLI - Add metrics for per-node shard counts, labelled by AZ - Add a CLI for setting preferred AZ on a tenant - Extend AZ-setting API+CLI to handle None for clearing preferred AZ --- control_plane/storcon_cli/src/main.rs | 120 ++++++++++++++++++++-- libs/pageserver_api/src/controller_api.rs | 4 +- storage_controller/src/metrics.rs | 19 ++++ storage_controller/src/node.rs | 1 + storage_controller/src/persistence.rs | 7 +- storage_controller/src/scheduler.rs | 29 +++++- storage_controller/src/service.rs | 9 +- storage_controller/src/tenant_shard.rs | 4 +- 8 files changed, 175 insertions(+), 18 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 1653b3c845..9d133e4af1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,12 +1,16 @@ use futures::StreamExt; -use std::{str::FromStr, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + time::Duration, +}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -153,6 +157,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + TenantSetPreferredAz { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + preferred_az: Option, + }, /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. 
TenantDrop { @@ -402,11 +412,12 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, + node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); @@ -479,6 +490,7 @@ async fn main() -> anyhow::Result<()> { let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", + "Preferred AZ", "ShardCount", "StripeSize", "Placement", @@ -488,6 +500,11 @@ async fn main() -> anyhow::Result<()> { let shard_zero = tenant.shards.into_iter().next().unwrap(); table.add_row([ format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), format!("{:?}", tenant.stripe_size), format!("{:?}", tenant.policy), @@ -614,6 +631,19 @@ async fn main() -> anyhow::Result<()> { None, ) .await?; + + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let nodes = nodes + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{:?}", policy)]); @@ -622,7 +652,14 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); - table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + table.set_header([ + "Shard", + "Attached", + "Attached AZ", + "Secondary", + "Last error", + "status", + ]); for shard in shards { let secondary = shard .node_secondary @@ -645,11 +682,18 @@ async fn main() -> anyhow::Result<()> { } let status = status_parts.join(","); + let attached_node = shard + .node_attached + .as_ref() + .map(|id| nodes.get(id).expect("Shard references nonexistent node")); + table.add_row([ format!("{}", shard.tenant_shard_id), - shard - .node_attached - .map(|n| format!("{}", n)) + attached_node + .map(|n| format!("{} ({})", n.listen_http_addr, n.id)) + .unwrap_or(String::new()), + attached_node + .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, @@ -658,6 +702,66 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantSetPreferredAz { + tenant_id, + preferred_az, + } => { + // First learn about the tenant's shards + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + // Learn about nodes to validate the AZ ID + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + if let Some(preferred_az) = &preferred_az { + let azs = nodes + .into_iter() + .map(|n| (n.availability_zone_id)) + .collect::>(); + if !azs.contains(preferred_az) { + anyhow::bail!( + "AZ {} not found on any node: known AZs are: {:?}", + preferred_az, + azs + ); + } + } else { + // Make it obvious to the user that since they've omitted an AZ, we're clearing it + eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + } + + // Construct a request that modifies all the tenant's shards + let req = ShardsPreferredAzsRequest { + 
preferred_az_ids: describe_response + .shards + .into_iter() + .map(|s| { + ( + s.tenant_shard_id, + preferred_az.clone().map(AvailabilityZone), + ) + }) + .collect(), + }; + storcon_client + .dispatch::( + Method::PUT, + "control/v1/preferred_azs".to_string(), + Some(req), + ) + .await?; + } Command::TenantWarmup { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f3aefc6df9..f3880cb766 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ impl Display for AvailabilityZone { #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] @@ -144,6 +144,8 @@ pub struct NodeDescribeResponse { pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, + pub availability_zone_id: String, + pub listen_http_addr: String, pub listen_http_port: u16, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d5885eba6..4164e3dc2b 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards are not scheduled into their preferred AZ pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + /// How many shard locations (secondary or attached) on each node + pub(crate) storage_controller_node_shards: measured::GaugeVec, + + /// How many _attached_ shard locations on each node + pub(crate) storage_controller_node_attached_shards: measured::GaugeVec, + + /// How many _home_ shard locations on each node (i.e. 
the node's AZ matches the shard's + /// preferred AZ) + pub(crate) storage_controller_node_home_shards: measured::GaugeVec, + /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, @@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics { } } +#[derive(measured::LabelGroup, Clone)] +#[label(set = NodeLabelGroupSet)] +pub(crate) struct NodeLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) az: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) node_id: &'a str, +} + #[derive(measured::LabelGroup)] #[label(set = ReconcileCompleteLabelGroupSet)] pub(crate) struct ReconcileCompleteLabelGroup { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4cc9b0070d..f5c2d329e0 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -299,6 +299,7 @@ impl Node { id: self.id, availability: self.availability.clone().into(), scheduling: self.scheduling, + availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, listen_pg_addr: self.listen_pg_addr.clone(), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index beb014f0a8..eb0bfc879e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -708,10 +708,11 @@ impl Persistence { Ok(()) } + /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified) pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, Option)>, + ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -722,7 +723,7 @@ impl Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.0.clone())) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) .execute(conn)?; if updated == 1 { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 04a594dcac..f5cab9dd57 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_shard::TenantShard}; +use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; @@ -872,6 +872,33 @@ impl Scheduler { pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().attached_shard_count } + + /// Some metrics that we only calculate periodically: this is simpler than + /// rigorously updating them on every change. 
+ pub(crate) fn update_metrics(&self) { + for (node_id, node) in &self.nodes { + let node_id_str = format!("{}", node_id); + let label_group = NodeLabelGroup { + az: &node.az.0, + node_id: &node_id_str, + }; + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_shards + .set(label_group.clone(), node.shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_attached_shards + .set(label_group.clone(), node.attached_shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_home_shards + .set(label_group.clone(), node.home_shard_count as i64); + } + } } #[cfg(test)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index dadcc44cfb..cbb9103880 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2517,7 +2517,7 @@ impl Service { .map(|t| { ( t.get_tenant_shard_id().expect("Corrupt shard in database"), - load_in_az.clone(), + Some(load_in_az.clone()), ) }) .collect(), @@ -6390,7 +6390,7 @@ impl Service { /// available. A return value of 0 indicates that everything is fully reconciled already. fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); // This function is an efficient place to update lazy statistics, since we are walking @@ -6451,6 +6451,9 @@ impl Service { } } + // Some metrics are calculated from SchedulerNode state, update these periodically + scheduler.update_metrics(); + // Process any deferred tenant drops for (tenant_id, guard) in drop_detached_tenants { self.maybe_drop_tenant(tenant_id, &mut locked, &guard); @@ -6509,7 +6512,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; - tracing::info!("Applying optimization: {optimization:?}"); + tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2ba2a57eba..79ed628c25 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1708,8 +1708,8 @@ impl TenantShard { self.intent.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.intent.preferred_az_id = Some(preferred_az_id); + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { + self.intent.preferred_az_id = preferred_az_id; } /// Returns all the nodes to which this tenant shard is attached according to the From ffaa52ff5d3f36becaf70be2b6053c7423deba61 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 17:31:59 +0100 Subject: [PATCH 29/32] pageserver: reorder upload queue when possible (#10218) ## Problem The upload queue currently sees significant head-of-line blocking. For example, index uploads act as upload barriers, and for every layer flush we schedule a layer and index upload, which effectively serializes layer uploads. Resolves #10096. ## Summary of changes Allow upload queue operations to bypass the queue if they don't conflict with preceding operations, increasing parallelism. NB: the upload queue currently schedules an explicit barrier after every layer flush as well (see #8550). 
This must be removed to enable parallelism. This will require a better mechanism for compaction backpressure, see e.g. #8390 or #5415. --- Cargo.lock | 1 + pageserver/Cargo.toml | 5 + pageserver/benches/upload_queue.rs | 86 ++ pageserver/src/tenant/metadata.rs | 1 - .../src/tenant/remote_timeline_client.rs | 101 +- .../tenant/remote_timeline_client/index.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/upload_queue.rs | 957 +++++++++++++++++- 8 files changed, 1029 insertions(+), 126 deletions(-) create mode 100644 pageserver/benches/upload_queue.rs diff --git a/Cargo.lock b/Cargo.lock index 08453120c7..1e29f4fc08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4044,6 +4044,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "postgres_initdb", + "pprof", "pq_proto", "procfs", "rand 0.8.5", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 140b287ccc..8547746d94 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -44,6 +44,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true +pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -108,3 +109,7 @@ harness = false [[bench]] name = "bench_ingest" harness = false + +[[bench]] +name = "upload_queue" +harness = false diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs new file mode 100644 index 0000000000..528b3d5490 --- /dev/null +++ b/pageserver/benches/upload_queue.rs @@ -0,0 +1,86 @@ +//! Upload queue benchmarks. + +use std::str::FromStr as _; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; +use pageserver::tenant::IndexPart; +use pprof::criterion::{Output, PProfProfiler}; +use utils::generation::Generation; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_upload_queue_next_ready, +); +criterion_main!(benches); + +/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks +/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload +/// queue as a whole is thus quadratic. +/// +/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test +/// Delete and UploadMetadata instead. This is incidentally the most expensive case. +fn bench_upload_queue_next_ready(c: &mut Criterion) { + let mut g = c.benchmark_group("upload_queue_next_ready"); + for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { + g.bench_function(format!("inprogress={inprogress}"), |b| { + run_bench(b, inprogress).unwrap() + }); + } + + fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { + // Construct two layers. layer0 is in the indexes, layer1 will be deleted. 
+ let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + + let metadata = LayerFileMetadata { + shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), + generation: Generation::Valid(1), + file_size: 0, + }; + + // Construct the (initial and uploaded) index with layer0. + let mut index = IndexPart::empty(TimelineMetadata::example()); + index.layer_metadata.insert(layer0, metadata.clone()); + + // Construct the queue. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&index)?; + + // Populate inprogress_tasks with a bunch of layer1 deletions. + let delete = UploadOp::Delete(Delete { + layers: vec![(layer1, metadata)], + }); + + for task_id in 0..(inprogress as u64) { + queue.inprogress_tasks.insert( + task_id, + Arc::new(UploadTask { + task_id, + retries: AtomicU32::new(0), + op: delete.clone(), + }), + ); + } + + // Benchmark index upload scheduling. + let index_upload = UploadOp::UploadMetadata { + uploaded: Box::new(index), + }; + + b.iter(|| { + queue.queued_operations.push_front(index_upload.clone()); + assert!(queue.next_ready().is_some()); + }); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 24440d4b35..d281eb305f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -320,7 +320,6 @@ impl TimelineMetadata { // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. - #[cfg(test)] pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 813111245d..75e8da496d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -63,22 +63,18 @@ //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. +//! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that -//! is safe. -//! -//! The caller should be careful with deletion, though. They should not delete -//! local files that have been scheduled for upload but not yet finished uploading. -//! Otherwise the upload will fail. To wait for an upload to finish, use -//! the 'wait_completion' function (more on that later.) +//! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! - We rely on read-after write consistency in the remote storage. -//! - Layer files are immutable +//! - Layer files are immutable. //! //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. 
Different tenants can be attached to different pageservers, but if the @@ -1855,57 +1851,17 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// The caller needs to already hold the `upload_queue` lock. + /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. + /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has + /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(next_op) = upload_queue.queued_operations.front() { - // Can we run this task now? - let can_run_now = match next_op { - UploadOp::UploadLayer(..) => { - // Can always be scheduled. - true - } - UploadOp::UploadMetadata { .. } => { - // These can only be performed after all the preceding operations - // have finished. - upload_queue.inprogress_tasks.is_empty() - } - UploadOp::Delete(..) => { - // Wait for preceding uploads to finish. Concurrent deletions are OK, though. - upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() - } + while let Some(mut next_op) = upload_queue.next_ready() { + debug!("starting op: {next_op}"); - UploadOp::Barrier(_) | UploadOp::Shutdown => { - upload_queue.inprogress_tasks.is_empty() - } - }; - - // If we cannot launch this task, don't look any further. - // - // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch - // them now, but we don't try to do that currently. For example, if the frontmost task - // is an index-file upload that cannot proceed until preceding uploads have finished, we - // could still start layer uploads that were scheduled later. - if !can_run_now { - break; - } - - if let UploadOp::Shutdown = next_op { - // leave the op in the queue but do not start more tasks; it will be dropped when - // the stop is called. - upload_queue.shutdown_ready.close(); - break; - } - - // We can launch this task. Remove it from the queue first. - let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); - - debug!("starting op: {}", next_op); - - // Update the counters and prepare + // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue @@ -1916,18 +1872,14 @@ impl RemoteTimelineClient { } else { *mode = Some(OpType::MayReorder) } - upload_queue.num_inprogress_layer_uploads += 1; - } - UploadOp::UploadMetadata { .. } => { - upload_queue.num_inprogress_metadata_uploads += 1; } + UploadOp::UploadMetadata { .. } => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } - upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { sender.send_replace(()); @@ -2027,6 +1979,8 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + // TODO: check if this mechanism can be removed now that can_bypass() performs + // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. 
@@ -2249,13 +2203,8 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _, _) => { - upload_queue.num_inprogress_layer_uploads -= 1; - None - } + UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { - upload_queue.num_inprogress_metadata_uploads -= 1; - // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. let last_updater = upload_queue.clean.1; @@ -2289,10 +2238,7 @@ impl RemoteTimelineClient { None } } - UploadOp::Delete(_) => { - upload_queue.num_inprogress_deletions -= 1; - None - } + UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; @@ -2416,9 +2362,6 @@ impl RemoteTimelineClient { visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] @@ -2445,14 +2388,6 @@ impl RemoteTimelineClient { } }; - // consistency check - assert_eq!( - qi.num_inprogress_layer_uploads - + qi.num_inprogress_metadata_uploads - + qi.num_inprogress_deletions, - qi.inprogress_tasks.len() - ); - // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. @@ -2899,8 +2834,8 @@ mod tests { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.is_empty()); - assert!(upload_queue.inprogress_tasks.len() == 2); - assert!(upload_queue.num_inprogress_layer_uploads == 2); + assert_eq!(upload_queue.inprogress_tasks.len(), 2); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2); // also check that `latest_file_changes` was updated assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); @@ -2970,8 +2905,8 @@ mod tests { // Deletion schedules upload of the index file, and the file deletion itself assert_eq!(upload_queue.queued_operations.len(), 2); assert_eq!(upload_queue.inprogress_tasks.len(), 1); - assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); - assert_eq!(upload_queue.num_inprogress_deletions, 0); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1); + assert_eq!(upload_queue.num_inprogress_deletions(), 0); assert_eq!( upload_queue.latest_files_changes_since_metadata_upload_scheduled, 0 diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 51f093cb87..244be5bbb7 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + pub fn empty(metadata: TimelineMetadata) -> Self { IndexPart { version: Self::LATEST_VERSION, layer_metadata: Default::default(), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8933e8ceb1..2b06c88e8b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1812,7 +1812,7 @@ enum LayerKind { /// Guard for forcing a layer be resident while it exists. 
#[derive(Clone)] -pub(crate) struct ResidentLayer { +pub struct ResidentLayer { owner: Layer, downloaded: Arc, } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index ef3aa759f3..bd524e8153 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,28 +1,33 @@ +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Debug; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::AsLayerDesc as _; use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; use chrono::NaiveDateTime; -use std::sync::Arc; +use once_cell::sync::Lazy; use tracing::info; -use utils::lsn::AtomicLsn; -use std::sync::atomic::AtomicU32; -use utils::lsn::Lsn; - -use utils::generation::Generation; +/// Kill switch for upload queue reordering in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(super) enum UploadQueue { +pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), @@ -39,13 +44,13 @@ impl UploadQueue { } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -pub(crate) enum OpType { +pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. -pub(crate) struct UploadQueueInitialized { +pub struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -70,21 +75,16 @@ pub(crate) struct UploadQueueInitialized { /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, - // Breakdown of different kinds of tasks currently in-progress - pub(crate) num_inprogress_layer_uploads: usize, - pub(crate) num_inprogress_metadata_uploads: usize, - pub(crate) num_inprogress_deletions: usize, - /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. - pub(crate) inprogress_tasks: HashMap>, + pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. - pub(crate) queued_operations: VecDeque, + pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. 
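As a companion to the sketch under `remote_timeline_client.rs` above, the following illustrative sketch (again an editor's addition, not part of the patch, with made-up names and a benchmark-style setup) shows the safety side of the contract that `next_ready()` and `can_bypass` enforce: a deletion of a layer that the still-active index references may not overtake the index upload that drops that reference.

```rust
// Illustrative sketch only; layer name and task id are made up.
use std::str::FromStr as _;
use std::sync::atomic::AtomicU32;
use std::sync::Arc;

use pageserver::tenant::metadata::TimelineMetadata;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
use pageserver::tenant::IndexPart;
use utils::generation::Generation;
use utils::shard::{ShardCount, ShardIndex, ShardNumber};

fn main() -> anyhow::Result<()> {
    let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
    let metadata = LayerFileMetadata {
        shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
        generation: Generation::Valid(1),
        file_size: 0,
    };

    // The current remote ("clean") index still references layer0.
    let mut old_index = IndexPart::empty(TimelineMetadata::example());
    old_index
        .layer_metadata
        .insert(layer0.clone(), metadata.clone());

    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&old_index)?;

    // Schedule a new index that drops layer0, followed by the deletion of layer0.
    queue.queued_operations.push_back(UploadOp::UploadMetadata {
        uploaded: Box::new(IndexPart::empty(TimelineMetadata::example())),
    });
    queue.queued_operations.push_back(UploadOp::Delete(Delete {
        layers: vec![(layer0, metadata)],
    }));

    // The index upload is ready first ...
    let op = queue.next_ready().expect("index upload is ready");
    assert!(matches!(op, UploadOp::UploadMetadata { .. }));
    queue.inprogress_tasks.insert(
        1,
        Arc::new(UploadTask {
            task_id: 1,
            retries: AtomicU32::new(0),
            op,
        }),
    );

    // ... but while it is in flight, the deletion stays queued: the active index
    // still references layer0, so next_ready() holds it back.
    assert!(queue.next_ready().is_none());
    Ok(())
}
```

Once the in-flight index upload completes and the clean index no longer references `layer0`, the queued deletion becomes ready on the next call.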
@@ -122,6 +122,129 @@ impl UploadQueueInitialized { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } + + /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily + /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump + /// the queue if it doesn't conflict with operations ahead of it. + /// + /// None may be returned even if the queue isn't empty, if no operations are ready yet. + pub fn next_ready(&mut self) -> Option { + // NB: this is quadratic, but queues are expected to be small. + for (i, candidate) in self.queued_operations.iter().enumerate() { + // If this candidate is ready, go for it. Otherwise, try the next one. + if self.is_ready(i) { + // Shutdown operations are left at the head of the queue, to prevent further + // operations from starting. Signal that we're ready to shut down. + if matches!(candidate, UploadOp::Shutdown) { + assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); + assert_eq!(i, 0, "shutdown not at head of queue"); + self.shutdown_ready.close(); + return None; + } + + return self.queued_operations.remove(i); + } + + // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. + if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { + return None; + } + + // If upload queue reordering is disabled, bail out after the first operation. + if *DISABLE_UPLOAD_QUEUE_REORDERING { + return None; + } + } + None + } + + /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if + /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are + /// allowed to skip the queue when it's safe to do so, to increase parallelism. + /// + /// The position must be valid for the queue size. + fn is_ready(&self, pos: usize) -> bool { + let candidate = self.queued_operations.get(pos).expect("invalid position"); + self + // Look at in-progress operations, in random order. + .inprogress_tasks + .values() + .map(|task| &task.op) + // Then queued operations ahead of the candidate, front-to-back. + .chain(self.queued_operations.iter().take(pos)) + // Keep track of the active index ahead of each operation. This is used to ensure that + // an upload doesn't skip the queue too far, such that it modifies a layer that's + // referenced by an active index. + // + // It's okay that in-progress operations are emitted in random order above, since at + // most one of them can be an index upload (enforced by can_bypass). + .scan(&self.clean.0, |next_active_index, op| { + let active_index = *next_active_index; + if let UploadOp::UploadMetadata { ref uploaded } = op { + *next_active_index = uploaded; // stash index for next operation after this + } + Some((op, active_index)) + }) + // Check if the candidate can bypass all of them. + .all(|(op, active_index)| candidate.can_bypass(op, active_index)) + } + + /// Returns the number of in-progress deletion operations. + #[cfg(test)] + pub(crate) fn num_inprogress_deletions(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) + .count() + } + + /// Returns the number of in-progress layer uploads. 
+ #[cfg(test)] + pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) + .count() + } + + /// Test helper that schedules all ready operations into inprogress_tasks, and returns + /// references to them. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn schedule_ready(&mut self) -> Vec> { + let mut tasks = Vec::new(); + // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. + while let Some(op) = self.next_ready() { + self.task_counter += 1; + let task = Arc::new(UploadTask { + task_id: self.task_counter, + op, + retries: 0.into(), + }); + self.inprogress_tasks.insert(task.task_id, task.clone()); + tasks.push(task); + } + tasks + } + + /// Test helper that marks an operation as completed, removing it from inprogress_tasks. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn complete(&mut self, task_id: u64) { + let Some(task) = self.inprogress_tasks.remove(&task_id) else { + return; + }; + // Update the clean index on uploads. + if let UploadOp::UploadMetadata { ref uploaded } = task.op { + if task.task_id > self.clean.1.unwrap_or_default() { + self.clean = (*uploaded.clone(), Some(task.task_id)); + } + } + } } #[derive(Clone, Copy)] @@ -131,12 +254,12 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStoppedDeletable { +pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } -pub(super) enum UploadQueueStopped { +pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } @@ -163,7 +286,7 @@ impl NotInitialized { } impl UploadQueue { - pub(crate) fn initialize_empty_remote( + pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, ) -> anyhow::Result<&mut UploadQueueInitialized> { @@ -185,9 +308,6 @@ impl UploadQueue { visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -202,7 +322,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialize_with_current_remote_index_part( + pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, ) -> anyhow::Result<&mut UploadQueueInitialized> { @@ -227,9 +347,6 @@ impl UploadQueue { ), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -244,9 +361,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut( - &mut self, - ) -> Result<&mut UploadQueueInitialized, NotInitialized> { + pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), @@ -276,23 +391,23 @@ 
impl UploadQueue { /// An in-progress upload or delete task. #[derive(Debug)] -pub(crate) struct UploadTask { +pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - pub(crate) task_id: u64, - pub(crate) retries: AtomicU32, + pub task_id: u64, + pub retries: AtomicU32, - pub(crate) op: UploadOp, + pub op: UploadOp, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] -pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, +pub struct Delete { + pub layers: Vec<(LayerName, LayerFileMetadata)>, } -#[derive(Debug)] -pub(crate) enum UploadOp { +#[derive(Clone, Debug)] +pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. UploadLayer(ResidentLayer, LayerFileMetadata, Option), @@ -338,3 +453,765 @@ impl std::fmt::Display for UploadOp { } } } + +impl UploadOp { + /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the + /// active index when other would be uploaded -- if we allow self to bypass other, this would + /// be the active index when self is uploaded. + pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { + match (self, other) { + // Nothing can bypass a barrier or shutdown, and it can't bypass anything. + (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, + (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, + + // Uploads and deletes can bypass each other unless they're for the same file. + (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { + let aname = &a.layer_desc().layer_name(); + let bname = &b.layer_desc().layer_name(); + !is_same_remote_layer_path(aname, ameta, bname, bmeta) + } + (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) + | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { + d.layers.iter().all(|(dname, dmeta)| { + !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) + }) + } + + // Deletes are idempotent and can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => true, + + // Uploads and deletes can bypass an index upload as long as neither the uploaded index + // nor the active index below it references the file. A layer can't be modified or + // deleted while referenced by an index. + // + // Similarly, index uploads can bypass uploads and deletes as long as neither the + // uploaded index nor the active index references the file (the latter would be + // incorrect use by the caller). + (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { + let uname = u.layer_desc().layer_name(); + !i.references(&uname, umeta) && !index.references(&uname, umeta) + } + (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { + d.layers.iter().all(|(dname, dmeta)| { + !i.references(dname, dmeta) && !index.references(dname, dmeta) + }) + } + + // Indexes can never bypass each other. + // TODO: we could coalesce them though, by only uploading the newest ready index. This + // is left for later, out of caution. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. 
}) => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::storage_layer::Layer; + use crate::tenant::Timeline; + use crate::DEFAULT_PG_VERSION; + use itertools::Itertools as _; + use std::str::FromStr as _; + use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. + #[track_caller] + fn assert_same_op(a: &UploadOp, b: &UploadOp) { + use UploadOp::*; + match (a, b) { + (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { + assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); + assert_eq!(ameta, bmeta); + assert_eq!(atype, btype); + } + (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), + (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), + (Barrier(_), Barrier(_)) => {} + (Shutdown, Shutdown) => {} + (a, b) => panic!("{a:?} != {b:?}"), + } + } + + /// Test helper which asserts that two sets of operations are the same. + #[track_caller] + fn assert_same_ops<'a>( + a: impl IntoIterator, + b: impl IntoIterator, + ) { + a.into_iter() + .zip_eq(b) + .for_each(|(a, b)| assert_same_op(a, b)) + } + + /// Test helper to construct a test timeline. + /// + /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to + /// test the upload queue -- decouple ResidentLayer from Timeline. + /// + /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to + /// obtain a TimelineMetadata from a Timeline. + fn make_timeline() -> Arc { + // Grab the current test name from the current thread name. + // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now. + let test_name = std::thread::current().name().unwrap().to_string(); + let test_name = Box::leak(test_name.into_boxed_str()); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + + runtime + .block_on(async { + let harness = TenantHarness::create(test_name).await?; + let (tenant, ctx) = harness.load().await; + tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await + }) + .expect("failed to create timeline") + } + + /// Test helper to construct an (empty) resident layer. + fn make_layer(timeline: &Arc, name: &str) -> ResidentLayer { + make_layer_with_size(timeline, name, 0) + } + + /// Test helper to construct a resident layer with the given size. + fn make_layer_with_size(timeline: &Arc, name: &str, size: usize) -> ResidentLayer { + let metadata = LayerFileMetadata { + generation: timeline.generation, + shard: timeline.get_shard_index(), + file_size: size as u64, + }; + make_layer_with_metadata(timeline, name, metadata) + } + + /// Test helper to construct a layer with the given metadata. 
+ fn make_layer_with_metadata( + timeline: &Arc, + name: &str, + metadata: LayerFileMetadata, + ) -> ResidentLayer { + let name = LayerName::from_str(name).expect("invalid name"); + let local_path = local_layer_path( + timeline.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &metadata.generation, + ); + std::fs::write(&local_path, vec![0; metadata.file_size as usize]) + .expect("failed to write file"); + Layer::for_resident(timeline.conf, timeline, local_path, name, metadata) + } + + /// Test helper to add a layer to an index and return a new index. + fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + Box::new(index) + } + + /// Test helper to remove a layer from an index and return a new index. + fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .remove(&layer.layer_desc().layer_name()); + Box::new(index) + } + + /// Nothing can bypass a barrier, and it can't bypass inprogress tasks. + #[test] + fn schedule_barrier() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let (barrier, _) = tokio::sync::watch::channel(()); + + // Enqueue non-conflicting upload, delete, and index before and after a barrier. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Barrier(barrier), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the barrier. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Barrier(_)) + )); + + // Complete the initial operations. The barrier isn't scheduled while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // Schedule the barrier. The later tasks won't schedule until it completes. + let tasks = queue.schedule_ready(); + + assert_eq!(tasks.len(), 1); + assert!(matches!(tasks[0].op, UploadOp::Barrier(_))); + assert_eq!(queue.queued_operations.len(), 3); + + // Complete the barrier. The rest of the tasks schedule immediately. 
+ queue.complete(tasks[0].task_id); + + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Deletes can be scheduled in parallel, even if they're for the same file. + #[test] + fn schedule_delete_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + // Enqueue a bunch of deletes, some with conflicting names. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::Delete(Delete { + layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![ + (layer1.layer_desc().layer_name(), layer1.metadata()), + (layer2.layer_desc().layer_name(), layer2.metadata()), + ], + }), + UploadOp::Delete(Delete { + layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Since deletes don't conflict, they're all scheduled. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads are serialized. + #[test] + fn schedule_upload_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three versions of the same layer, with different file sizes. + let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); + let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); + let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + + let ops = [ + UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), + UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None), + UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads and deletes are serialized. 
+ #[test] + fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting + /// delete/upload operations at the head of the queue. + #[test] + fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + // + // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue + // and run immediately. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations 0, 3, and 4 are scheduled immediately. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]); + assert_eq!(queue.queued_operations.len(), 2); + + Ok(()) + } + + /// Non-conflicting uploads are parallelized. 
+ #[test] + fn schedule_upload_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three different layer uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // All uploads should be scheduled concurrently. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Index uploads are serialized. + #[test] + fn schedule_index_serial() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + + let ops = [ + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The uploads should run serially. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads. + /// This is the common case with layer flushes. + #[test] + fn schedule_index_upload_chain() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three uploads of the current empty index. 
+ let index = Box::new(queue.clean.0.clone()); + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index0 = index_with(&index, &layer0); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index1 = index_with(&index0, &layer1); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index2 = index_with(&index1, &layer2); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index0.clone(), + }, + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index1.clone(), + }, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index2.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The layer uploads should be scheduled immediately. The indexes must wait. + let upload_tasks = queue.schedule_ready(); + assert_same_ops( + upload_tasks.iter().map(|t| &t.op), + [&ops[0], &ops[2], &ops[4]], + ); + + // layer2 completes first. None of the indexes can upload yet. + queue.complete(upload_tasks[2].task_id); + assert!(queue.schedule_ready().is_empty()); + + // layer0 completes. index0 can upload. It completes. + queue.complete(upload_tasks[0].task_id); + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[1]); + queue.complete(index_tasks[0].task_id); + + // layer 1 completes. This unblocks index 1 then index 2. + queue.complete(upload_tasks[1].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[3]); + queue.complete(index_tasks[0].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[5]); + queue.complete(index_tasks[0].task_id); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// A delete can't bypass an index upload if an index ahead of it still references it. + #[test] + fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index_upload = index_with(&queue.clean.0, &layer); + + // Remove the layer reference in a new index, then delete the layer. + let index_deref = index_without(&index_upload, &layer); + + let ops = [ + // Initial upload. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + // Dereference the layer and delete it. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. 
+ for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a + /// dereference/upload/reference cycle can't allow the upload to bypass the reference. + #[test] + fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Upload the layer. Then dereference the layer, and upload/reference it again. + let index_upload = index_with(&queue.clean.0, &layer); + let index_deref = index_without(&index_upload, &layer); + let index_ref = index_with(&index_deref, &layer); + + let ops = [ + // Initial upload. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + // Dereference the layer. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + // Replace and reference the layer. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_ref.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from + /// next_ready(), but is left at the head of the queue. + #[test] + fn schedule_shutdown() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Enqueue non-conflicting upload, delete, and index before and after a shutdown. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Shutdown, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the shutdown. 
+ let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Shutdown) + )); + + // Complete the initial operations. The shutdown isn't triggered while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // The shutdown is triggered the next time we try to pull an operation. It isn't returned, + // but is left in the queue. + assert!(!queue.shutdown_ready.is_closed()); + assert!(queue.next_ready().is_none()); + assert!(queue.shutdown_ready.is_closed()); + + Ok(()) + } + + /// Tests that can_bypass takes name, generation and shard index into account for all operations. + #[test] + fn can_bypass_path() -> anyhow::Result<()> { + let tli = make_timeline(); + + let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + + // Asserts that layers a and b either can or can't bypass each other, for all combinations + // of operations (except Delete and UploadMetadata which are special-cased). + #[track_caller] + fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) { + let index = IndexPart::empty(TimelineMetadata::example()); + for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) { + match (&a, &b) { + // Deletes can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)), + // Indexes can never bypass each other. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => { + assert!(!a.can_bypass(&b, &index)) + } + // For other operations, assert as requested. + (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass), + } + } + } + + fn make_ops(layer: ResidentLayer) -> Vec { + let mut index = IndexPart::empty(TimelineMetadata::example()); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + vec![ + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: Box::new(index), + }, + ] + } + + // Makes a ResidentLayer. + let layer = |name: &'static str, shard: Option, generation: u32| -> ResidentLayer { + let shard = shard + .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8))) + .unwrap_or(ShardIndex::unsharded()); + let metadata = LayerFileMetadata { + shard, + generation: Generation::Valid(generation), + file_size: 0, + }; + make_layer_with_metadata(&tli, name, metadata) + }; + + // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as + // 0 or >0 generation. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); + assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); + + // Different names can bypass. + assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); + + // Different shards can bypass. Shard 0 is different from unsharded. + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); + + // Different generations can bypass, both sharded and unsharded. 
+ assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); + assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); + + Ok(()) + } +} From d36112d20fdc249c324308f82b2b81bb124065d9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 14 Jan 2025 19:02:35 +0200 Subject: [PATCH 30/32] Simplify compute dockerfile by setting PATH just once (#10357) By setting PATH in the 'pg-build' layer, all the extension build layers will inherit. No need to pass PG_CONFIG to all the various make invocations either: once pg_config is in PATH, the Makefiles will pick it up from there. --- compute/compute-node.Dockerfile | 168 ++++++++++++-------------------- 1 file changed, 63 insertions(+), 105 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 89cee6761f..299f4444a3 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -104,16 +104,18 @@ RUN cd postgres && \ esac; \ done; +# Set PATH for all the subsequent build steps +ENV PATH="/usr/local/pgsql/bin:$PATH" + ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM build-deps AS postgis-build +FROM pg-build AS postgis-build ARG DEBIAN_VERSION ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ @@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \ DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ ninja clean && cp -R /sfcgal/* / -ENV PATH="/usr/local/pgsql/bin:$PATH" - # Postgis 3.5.0 supports v17 RUN case "${PG_VERSION}" in \ "v17") \ @@ -227,9 +227,8 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM build-deps AS plv8-build +FROM pg-build AS plv8-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch @@ -264,7 +263,6 @@ RUN case "${PG_VERSION}" in \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ @@ -291,9 +289,8 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM build-deps AS h3-pg-build +FROM pg-build AS h3-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v4.1.0 - Jan 18, 2023 @@ -314,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -326,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # compile unit extension # ######################################################################################### -FROM build-deps AS unit-pg-build +FROM pg-build AS unit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release 7.9 - Sep 15, 2024 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. @@ -350,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - # compile pgvector extension # ######################################################################################### -FROM build-deps AS vector-pg-build +FROM pg-build AS vector-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/pgvector.patch /pgvector.patch @@ -366,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -376,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O # compile pgjwt extension # ######################################################################################### -FROM build-deps AS pgjwt-pg-build +FROM pg-build AS pgjwt-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### @@ -394,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71 # compile hypopg extension # ######################################################################################### -FROM build-deps AS hypopg-pg-build +FROM pg-build AS hypopg-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### @@ -413,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo # compile pg_hashids extension # ######################################################################################### -FROM build-deps AS pg-hashids-pg-build +FROM pg-build AS pg-hashids-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.1 -Jan 12, 2018 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### @@ -432,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # compile rum extension # ######################################################################################### -FROM build-deps AS rum-pg-build +FROM pg-build AS rum-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch @@ -445,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### @@ -455,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea # compile pgTAP extension # ######################################################################################### -FROM build-deps AS pgtap-pg-build +FROM pg-build AS pgtap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### @@ -474,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta # compile ip4r extension # ######################################################################################### -FROM build-deps AS ip4r-pg-build +FROM pg-build AS ip4r-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.4.2 - Jul 29, 2023 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### @@ -493,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # compile Prefix extension # ######################################################################################### -FROM build-deps AS prefix-pg-build +FROM pg-build AS prefix-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.10 - Jul 5, 2023 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### @@ -512,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # compile hll extension # ######################################################################################### -FROM build-deps AS hll-pg-build +FROM pg-build AS hll-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.18 - Aug 29, 2023 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### @@ -531,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # compile plpgsql_check extension # ######################################################################################### -FROM build-deps AS plpgsql-check-pg-build +FROM pg-build AS plpgsql-check-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### @@ -550,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz # compile timescaledb extension # ######################################################################################### -FROM build-deps AS timescaledb-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS timescaledb-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -585,11 +567,8 @@ RUN case "${PG_VERSION}" in \ # compile pg_hint_plan extension # ######################################################################################### -FROM build-deps AS pg-hint-plan-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS pg-hint-plan-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" # version-specific, has separate releases for each version RUN case "${PG_VERSION}" in \ @@ -627,14 +606,12 @@ RUN case "${PG_VERSION}" in \ # compile pg_cron extension # ######################################################################################### -FROM build-deps AS pg-cron-pg-build +FROM pg-build AS pg-cron-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -648,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O # compile rdkit extension # ######################################################################################### -FROM build-deps AS rdkit-pg-build +FROM pg-build AS rdkit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ @@ -668,7 +644,13 @@ RUN apt update && \ # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" + RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -721,13 +703,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_uuidv7 extension # ######################################################################################### -FROM build-deps AS pg-uuidv7-pg-build +FROM pg-build AS pg-uuidv7-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.6.0 - Oct 9, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -741,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz # compile pg_roaringbitmap extension # ######################################################################################### -FROM build-deps AS pg-roaringbitmap-pg-build +FROM pg-build AS pg-roaringbitmap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v0.5.4 - Jun 28, 2022 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -761,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # compile pg_semver extension # ######################################################################################### -FROM build-deps AS pg-semver-pg-build +FROM pg-build AS pg-semver-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # Release 0.40.0 breaks backward compatibility with previous versions # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0 # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -797,13 +773,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_embedding extension # ######################################################################################### -FROM build-deps AS pg-embedding-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +FROM pg-build AS pg-embedding-pg-build # This is our extension, support stopped in favor of pgvector # TODO: deprecate it ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -824,20 +798,18 @@ RUN case "${PG_VERSION}" in \ # compile anon extension # ######################################################################################### -FROM build-deps AS pg-anon-pg-build +FROM pg-build AS pg-anon-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### @@ -846,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \ # This layer is used to build `pgrx` deps # ######################################################################################### -FROM build-deps AS rust-extensions-build +FROM pg-build AS rust-extensions-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -856,7 +827,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -883,9 +854,8 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM build-deps AS rust-extensions-build-pgrx12 +FROM pg-build AS rust-extensions-build-pgrx12 ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -893,7 +863,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -1068,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM build-deps AS wal2json-pg-build +FROM pg-build AS wal2json-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ @@ -1087,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. 
# compile pg_ivm extension # ######################################################################################### -FROM build-deps AS pg-ivm-build +FROM pg-build AS pg-ivm-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -1107,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM build-deps AS pg-partman-build +FROM pg-build AS pg-partman-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -1129,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz ######################################################################################### FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ @@ -1147,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p # ######################################################################################### -FROM build-deps AS pg-repack-build +FROM pg-build AS pg-repack-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ From e58e29e63994373e5645a5123e0be191bf693813 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 19:01:14 +0100 Subject: [PATCH 31/32] pageserver: limit number of upload queue tasks (#10384) ## Problem The upload queue can currently schedule an arbitrary number of tasks. This can both spawn an unbounded number of Tokio tasks, and also significantly slow down upload queue scheduling as it's quadratic in number of operations. Touches #10096. ## Summary of changes Limit the number of inprogress tasks to the remote storage upload concurrency. While this concurrency limit is shared across all tenants, there's certainly no point in scheduling more than this -- we could even consider setting the limit lower, but don't for now to avoid artificially constraining tenants. 
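For illustration, a minimal standalone sketch of the gating idea follows (simplified stand-in types and values, not the pageserver's own structs): scheduling stops once the number of in-progress tasks reaches the configured cap, resumes as tasks complete, and a cap of 0 means "unlimited", mirroring the diff below. In the real change the cap is taken from `RemoteStorageConfig::concurrency_limit()` when the upload queue is initialized.

```rust
use std::collections::VecDeque;

/// Simplified stand-in for the upload queue; `inprogress_limit == 0` means no limit.
struct Queue {
    inprogress_limit: usize,
    inprogress: usize, // number of tasks currently running
    queued: VecDeque<&'static str>,
}

impl Queue {
    /// Pop the next operation unless the in-progress cap has been reached.
    fn next_ready(&mut self) -> Option<&'static str> {
        if self.inprogress_limit > 0 && self.inprogress >= self.inprogress_limit {
            return None; // at capacity; the caller retries when a task completes
        }
        let op = self.queued.pop_front()?;
        self.inprogress += 1;
        Some(op)
    }

    /// Mark one task as finished, freeing a slot.
    fn complete(&mut self) {
        self.inprogress -= 1;
    }
}

fn main() {
    let mut q = Queue {
        inprogress_limit: 2, // hypothetical storage concurrency limit of 2
        inprogress: 0,
        queued: VecDeque::from(["upload-a", "upload-b", "upload-c"]),
    };
    assert!(q.next_ready().is_some());
    assert!(q.next_ready().is_some());
    assert!(q.next_ready().is_none()); // third op waits for a free slot
    q.complete();
    assert_eq!(q.next_ready(), Some("upload-c"));
}
```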
--- libs/remote_storage/src/config.rs | 11 +++ pageserver/benches/upload_queue.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 29 +++++++- pageserver/src/tenant/upload_queue.rs | 73 ++++++++++++++++--- 4 files changed, 99 insertions(+), 16 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 49b1d9dc87..dae141bf77 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -43,6 +43,17 @@ impl RemoteStorageKind { } } +impl RemoteStorageConfig { + /// Helper to fetch the configured concurrency limit. + pub fn concurrency_limit(&self) -> Option { + match &self.storage { + RemoteStorageKind::LocalFs { .. } => None, + RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index 528b3d5490..ed644b0e3c 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -53,7 +53,7 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) { // Construct the queue. let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&index)?; + let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; // Populate inprogress_tasks with a bunch of layer1 deletions. let delete = UploadOp::Delete(Delete { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 75e8da496d..1602765585 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -425,8 +425,16 @@ impl RemoteTimelineClient { /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", @@ -441,8 +449,16 @@ impl RemoteTimelineClient { &self, local_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. 
+ let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata)?; + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -458,9 +474,15 @@ impl RemoteTimelineClient { let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); @@ -2355,6 +2377,7 @@ impl RemoteTimelineClient { // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { + inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index bd524e8153..09c8f6ad8c 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -51,6 +51,9 @@ pub enum OpType { /// This keeps track of queued and in-progress tasks. pub struct UploadQueueInitialized { + /// Maximum number of inprogress tasks to schedule. 0 is no limit. + pub(crate) inprogress_limit: usize, + /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -128,8 +131,14 @@ impl UploadQueueInitialized { /// the queue if it doesn't conflict with operations ahead of it. /// /// None may be returned even if the queue isn't empty, if no operations are ready yet. + /// + /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. pub fn next_ready(&mut self) -> Option { - // NB: this is quadratic, but queues are expected to be small. + // If inprogress_tasks is already at limit, don't schedule anything more. + if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { + return None; + } + for (i, candidate) in self.queued_operations.iter().enumerate() { // If this candidate is ready, go for it. Otherwise, try the next one. 
if self.is_ready(i) { @@ -289,6 +298,7 @@ impl UploadQueue { pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -302,6 +312,7 @@ impl UploadQueue { let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -325,6 +336,7 @@ impl UploadQueue { pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -339,6 +351,7 @@ impl UploadQueue { ); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -633,7 +646,7 @@ mod tests { #[test] fn schedule_barrier() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter @@ -700,7 +713,7 @@ mod tests { #[test] fn schedule_delete_parallel() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. @@ -745,7 +758,7 @@ mod tests { #[test] fn schedule_upload_conflicts() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. @@ -778,7 +791,7 @@ mod tests { #[test] fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue two layer uploads, with a delete of both layers in between them. These should be @@ -817,7 +830,7 @@ mod tests { #[test] fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue two layer uploads, with a delete of both layers in between them. These should be @@ -859,7 +872,7 @@ mod tests { #[test] fn schedule_upload_parallel() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three different layer uploads. 
@@ -888,7 +901,7 @@ mod tests { #[test] fn schedule_index_serial() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; // Enqueue three uploads of the current empty index. let index = Box::new(queue.clean.0.clone()); @@ -925,7 +938,7 @@ mod tests { #[test] fn schedule_index_upload_chain() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three uploads of the current empty index. @@ -994,7 +1007,7 @@ mod tests { #[test] fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Create a layer to upload. @@ -1038,7 +1051,7 @@ mod tests { #[test] fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Create a layer to upload. @@ -1085,7 +1098,7 @@ mod tests { #[test] fn schedule_shutdown() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter @@ -1139,6 +1152,42 @@ mod tests { Ok(()) } + /// Scheduling respects inprogress_limit. + #[test] + fn schedule_inprogress_limit() -> anyhow::Result<()> { + // Create a queue with inprogress_limit=2. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?; + let tli = make_timeline(); + + // Enqueue a bunch of uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Only 2 are scheduled. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]); + assert!(queue.next_ready().is_none()); + + // When one completes, another is scheduled. 
+ queue.complete(tasks[0].task_id); + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]); + + Ok(()) + } + /// Tests that can_bypass takes name, generation and shard index into account for all operations. #[test] fn can_bypass_path() -> anyhow::Result<()> { From 6debb49b87dcf2dcca53bf17ee005b936eac446c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 22:10:17 +0100 Subject: [PATCH 32/32] pageserver: coalesce index uploads when possible (#10248) ## Problem With upload queue reordering in #10218, we can easily get into a situation where multiple index uploads are queued back to back, which can't be parallelized. This will happen e.g. when multiple layer flushes enqueue layer/index/layer/index/... and the layers skip the queue and are uploaded in parallel. These index uploads will incur serial S3 roundtrip latencies, and may block later operations. Touches #10096. ## Summary of changes When multiple back-to-back index uploads are ready to upload, only upload the most recent index and drop the rest. --- pageserver/benches/upload_queue.rs | 1 + .../src/tenant/remote_timeline_client.rs | 6 +- pageserver/src/tenant/upload_queue.rs | 86 +++++++++++++------ 3 files changed, 67 insertions(+), 26 deletions(-) diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed644b0e3c..ed5daa8ae1 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -67,6 +67,7 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) { task_id, retries: AtomicU32::new(0), op: delete.clone(), + coalesced_ops: Vec::new(), }), ); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1602765585..47c4a8637d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1880,7 +1880,7 @@ impl RemoteTimelineClient { /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(mut next_op) = upload_queue.next_ready() { + while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { debug!("starting op: {next_op}"); // Prepare upload. @@ -1918,6 +1918,7 @@ impl RemoteTimelineClient { let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, + coalesced_ops, retries: AtomicU32::new(0), }); upload_queue @@ -2285,6 +2286,9 @@ impl RemoteTimelineClient { } self.metric_end(&task.op); + for coalesced_op in &task.coalesced_ops { + self.metric_end(coalesced_op); + } } fn metric_impl( diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 09c8f6ad8c..d302205ffe 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -22,6 +22,11 @@ use tracing::info; static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); +/// Kill switch for index upload coalescing in case it causes problems. +/// TODO: remove this once we have confidence in it. 
+static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); + // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized @@ -130,10 +135,12 @@ impl UploadQueueInitialized { /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump /// the queue if it doesn't conflict with operations ahead of it. /// + /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. + /// /// None may be returned even if the queue isn't empty, if no operations are ready yet. /// /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. - pub fn next_ready(&mut self) -> Option { + pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { // If inprogress_tasks is already at limit, don't schedule anything more. if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { return None; @@ -151,7 +158,36 @@ impl UploadQueueInitialized { return None; } - return self.queued_operations.remove(i); + let mut op = self.queued_operations.remove(i).expect("i can't disappear"); + + // Coalesce any back-to-back index uploads by only uploading the newest one that's + // ready. This typically happens with layer/index/layer/index/... sequences, where + // the layers bypass the indexes, leaving the indexes queued. + // + // If other operations are interleaved between index uploads we don't try to + // coalesce them, since we may as well update the index concurrently with them. + // This keeps the index fresh and avoids starvation. + // + // NB: we assume that all uploaded indexes have the same remote path. This + // is true at the time of writing: the path only depends on the tenant, + // timeline and generation, all of which are static for a timeline instance. + // Otherwise, we must be careful not to coalesce different paths. + let mut coalesced_ops = Vec::new(); + if matches!(op, UploadOp::UploadMetadata { .. }) { + while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) + { + if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { + break; + } + if !self.is_ready(i) { + break; + } + coalesced_ops.push(op); + op = self.queued_operations.remove(i).expect("i can't disappear"); + } + } + + return Some((op, coalesced_ops)); } // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. @@ -225,11 +261,12 @@ impl UploadQueueInitialized { fn schedule_ready(&mut self) -> Vec> { let mut tasks = Vec::new(); // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. - while let Some(op) = self.next_ready() { + while let Some((op, coalesced_ops)) = self.next_ready() { self.task_counter += 1; let task = Arc::new(UploadTask { task_id: self.task_counter, op, + coalesced_ops, retries: 0.into(), }); self.inprogress_tasks.insert(task.task_id, task.clone()); @@ -407,9 +444,13 @@ impl UploadQueue { pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. pub task_id: u64, + /// Number of task retries. pub retries: AtomicU32, - + /// The upload operation. pub op: UploadOp, + /// Any upload operations that were coalesced into this operation. 
This typically happens with + /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. + pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used @@ -512,9 +553,8 @@ impl UploadOp { }) } - // Indexes can never bypass each other. - // TODO: we could coalesce them though, by only uploading the newest ready index. This - // is left for later, out of caution. + // Indexes can never bypass each other. They can coalesce though, and + // `UploadQueue::next_ready()` currently does this when possible. (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false, } } @@ -897,9 +937,9 @@ mod tests { Ok(()) } - /// Index uploads are serialized. + /// Index uploads are coalesced. #[test] - fn schedule_index_serial() -> anyhow::Result<()> { + fn schedule_index_coalesce() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; @@ -920,13 +960,11 @@ mod tests { queue.queued_operations.extend(ops.clone()); - // The uploads should run serially. - for op in ops { - let tasks = queue.schedule_ready(); - assert_eq!(tasks.len(), 1); - assert_same_op(&tasks[0].op, &op); - queue.complete(tasks[0].task_id); - } + // The index uploads are coalesced into a single operation. + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &ops[2]); + assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]); assert!(queue.queued_operations.is_empty()); @@ -985,18 +1023,14 @@ mod tests { assert_same_op(&index_tasks[0].op, &ops[1]); queue.complete(index_tasks[0].task_id); - // layer 1 completes. This unblocks index 1 then index 2. + // layer 1 completes. This unblocks index 1 and 2, which coalesce into + // a single upload for index 2. queue.complete(upload_tasks[1].task_id); - let index_tasks = queue.schedule_ready(); - assert_eq!(index_tasks.len(), 1); - assert_same_op(&index_tasks[0].op, &ops[3]); - queue.complete(index_tasks[0].task_id); - let index_tasks = queue.schedule_ready(); assert_eq!(index_tasks.len(), 1); assert_same_op(&index_tasks[0].op, &ops[5]); - queue.complete(index_tasks[0].task_id); + assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]); assert!(queue.queued_operations.is_empty()); @@ -1018,11 +1052,12 @@ mod tests { let index_deref = index_without(&index_upload, &layer); let ops = [ - // Initial upload. + // Initial upload, with a barrier to prevent index coalescing. UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), UploadOp::UploadMetadata { uploaded: index_upload.clone(), }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), // Dereference the layer and delete it. UploadOp::UploadMetadata { uploaded: index_deref.clone(), @@ -1063,11 +1098,12 @@ mod tests { let index_ref = index_with(&index_deref, &layer); let ops = [ - // Initial upload. + // Initial upload, with a barrier to prevent index coalescing. UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), UploadOp::UploadMetadata { uploaded: index_upload.clone(), }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), // Dereference the layer. UploadOp::UploadMetadata { uploaded: index_deref.clone(),
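
To make the coalescing rule from this patch easier to follow in isolation, here is a small standalone model. The `Op` enum and `next_ready` helper are hypothetical and not the pageserver types; the readiness/conflict checks and the `DISABLE_UPLOAD_QUEUE_INDEX_COALESCING` kill switch from the real implementation are omitted:

```rust
// Sketch of coalescing back-to-back index uploads: only the newest index in a
// consecutive run is uploaded, the older ones are returned as coalesced ops.
use std::collections::VecDeque;

#[derive(Clone, Debug, PartialEq)]
enum Op {
    Layer(&'static str),
    Index(u32), // monotonically newer index versions
}

/// Pop the next op; if it is an index upload, swallow any directly following
/// index uploads and return them as coalesced, keeping only the newest one.
fn next_ready(queue: &mut VecDeque<Op>) -> Option<(Op, Vec<Op>)> {
    let mut op = queue.pop_front()?;
    let mut coalesced = Vec::new();
    if matches!(op, Op::Index(_)) {
        while matches!(queue.front(), Some(Op::Index(_))) {
            // The older index becomes a coalesced (skipped) upload.
            coalesced.push(op);
            op = queue.pop_front().expect("front was Some");
        }
    }
    Some((op, coalesced))
}

fn main() {
    let mut queue = VecDeque::from(vec![
        Op::Index(1),
        Op::Index(2),
        Op::Index(3),
        Op::Layer("layer0"),
        Op::Index(4),
    ]);

    let (op, coalesced) = next_ready(&mut queue).unwrap();
    assert_eq!(op, Op::Index(3)); // only the newest index of the run is uploaded
    assert_eq!(coalesced, vec![Op::Index(1), Op::Index(2)]);

    // A layer upload is not an index, so nothing is coalesced across it.
    let (op, coalesced) = next_ready(&mut queue).unwrap();
    assert_eq!(op, Op::Layer("layer0"));
    assert!(coalesced.is_empty());
}
```

The real `next_ready()` additionally requires each following index upload to pass `is_ready()` before it is swallowed, and relies on all index uploads sharing the same remote path, as the code comments in the patch note.
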