From ea3798e3b30f808f2851f58ff2390150b89959c6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Nov 2024 13:27:49 +0000
Subject: [PATCH 1/6] storage controller: use proper ScheduleContext when
 evacuating a node (#9908)

## Problem

When picking locations for a shard, we should use a ScheduleContext that
includes all the other shards in the tenant, so that we apply proper
anti-affinity between shards. If we don't do this, then it can lead to
unstable scheduling, where we place a shard somewhere that the optimizer
will then immediately move it away from.

We didn't always do this, because it was a bit awkward to accumulate the
context for a tenant rather than just walking tenants.

This was a TODO in `handle_node_availability_transition`:
```
                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
                        // for tenants without secondary locations: if they have a secondary location, then this
                        // schedule() call is just promoting an existing secondary)
```

This is a precursor to https://github.com/neondatabase/neon/issues/8264,
where the current imperfect scheduling during node evacuation hampers
testing.

## Summary of changes

- Add an iterator type that yields each shard along with a
schedulecontext that includes all the other shards from the same tenant
- Use the iterator to replace hand-crafted logic in optimize_all_plan
(functionally identical)
- Use the iterator in `handle_node_availability_transition` to apply
proper anti-affinity during node evacuation.
---
 storage_controller/src/scheduler.rs           |  17 +-
 storage_controller/src/service.rs             | 200 +++++++-----------
 .../src/service/context_iterator.rs           | 139 ++++++++++++
 storage_controller/src/tenant_shard.rs        |  11 +-
 4 files changed, 245 insertions(+), 122 deletions(-)
 create mode 100644 storage_controller/src/service/context_iterator.rs

diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 2414d95eb8..ecc6b11e47 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -305,7 +305,7 @@ impl std::ops::Add for AffinityScore {
 
 /// Hint for whether this is a sincere attempt to schedule, or a speculative
 /// check for where we _would_ schedule (done during optimization)
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) enum ScheduleMode {
     Normal,
     Speculative,
@@ -319,7 +319,7 @@ impl Default for ScheduleMode {
 
 // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
 // it for many shards in the same tenant.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct ScheduleContext {
     /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
     pub(crate) nodes: HashMap<NodeId, AffinityScore>,
@@ -331,6 +331,14 @@ pub(crate) struct ScheduleContext {
 }
 
 impl ScheduleContext {
+    pub(crate) fn new(mode: ScheduleMode) -> Self {
+        Self {
+            nodes: HashMap::new(),
+            attached_nodes: HashMap::new(),
+            mode,
+        }
+    }
+
     /// Input is a list of nodes we would like to avoid using again within this context.  The more
     /// times a node is passed into this call, the less inclined we are to use it.
     pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
@@ -355,6 +363,11 @@ impl ScheduleContext {
     pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
         self.attached_nodes.get(&node_id).copied().unwrap_or(0)
     }
+
+    #[cfg(test)]
+    pub(crate) fn attach_count(&self) -> usize {
+        self.attached_nodes.values().sum()
+    }
 }
 
 pub(crate) enum RefCountUpdate {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 446c476b99..636ccf11a1 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1,3 +1,6 @@
+pub mod chaos_injector;
+mod context_iterator;
+
 use hyper::Uri;
 use std::{
     borrow::Cow,
@@ -95,7 +98,7 @@ use crate::{
     },
 };
 
-pub mod chaos_injector;
+use context_iterator::TenantShardContextIterator;
 
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -5498,49 +5501,51 @@ impl Service {
 
                 let mut tenants_affected: usize = 0;
 
-                for (tenant_shard_id, tenant_shard) in tenants {
-                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
-                        // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
-                        // not assume our knowledge of the node's configuration is accurate until it comes back online
-                        observed_loc.conf = None;
-                    }
+                for (_tenant_id, mut schedule_context, shards) in
+                    TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+                {
+                    for tenant_shard in shards {
+                        let tenant_shard_id = tenant_shard.tenant_shard_id;
+                        if let Some(observed_loc) =
+                            tenant_shard.observed.locations.get_mut(&node_id)
+                        {
+                            // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
+                            // not assume our knowledge of the node's configuration is accurate until it comes back online
+                            observed_loc.conf = None;
+                        }
 
-                    if nodes.len() == 1 {
-                        // Special case for single-node cluster: there is no point trying to reschedule
-                        // any tenant shards: avoid doing so, in order to avoid spewing warnings about
-                        // failures to schedule them.
-                        continue;
-                    }
+                        if nodes.len() == 1 {
+                            // Special case for single-node cluster: there is no point trying to reschedule
+                            // any tenant shards: avoid doing so, in order to avoid spewing warnings about
+                            // failures to schedule them.
+                            continue;
+                        }
 
-                    if !nodes
-                        .values()
-                        .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
-                    {
-                        // Special case for when all nodes are unavailable and/or unschedulable: there is no point
-                        // trying to reschedule since there's nowhere else to go. Without this
-                        // branch we incorrectly detach tenants in response to node unavailability.
-                        continue;
-                    }
+                        if !nodes
+                            .values()
+                            .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
+                        {
+                            // Special case for when all nodes are unavailable and/or unschedulable: there is no point
+                            // trying to reschedule since there's nowhere else to go. Without this
+                            // branch we incorrectly detach tenants in response to node unavailability.
+                            continue;
+                        }
 
-                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                        tenant_shard.sequence = tenant_shard.sequence.next();
+                        if tenant_shard.intent.demote_attached(scheduler, node_id) {
+                            tenant_shard.sequence = tenant_shard.sequence.next();
 
-                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
-                        // for tenants without secondary locations: if they have a secondary location, then this
-                        // schedule() call is just promoting an existing secondary)
-                        let mut schedule_context = ScheduleContext::default();
-
-                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                            Err(e) => {
-                                // It is possible that some tenants will become unschedulable when too many pageservers
-                                // go offline: in this case there isn't much we can do other than make the issue observable.
-                                // TODO: give TenantShard a scheduling error attribute to be queried later.
-                                tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
-                            }
-                            Ok(()) => {
-                                if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() {
-                                    tenants_affected += 1;
-                                };
+                            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                                Err(e) => {
+                                    // It is possible that some tenants will become unschedulable when too many pageservers
+                                    // go offline: in this case there isn't much we can do other than make the issue observable.
+                                    // TODO: give TenantShard a scheduling error attribute to be queried later.
+                                    tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
+                                }
+                                Ok(()) => {
+                                    if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() {
+                                        tenants_affected += 1;
+                                    };
+                                }
                             }
                         }
                     }
@@ -6011,14 +6016,8 @@ impl Service {
         let (nodes, tenants, _scheduler) = locked.parts_mut();
         let pageservers = nodes.clone();
 
-        let mut schedule_context = ScheduleContext::default();
-
         let mut reconciles_spawned = 0;
-        for (tenant_shard_id, shard) in tenants.iter_mut() {
-            if tenant_shard_id.is_shard_zero() {
-                schedule_context = ScheduleContext::default();
-            }
-
+        for shard in tenants.values_mut() {
             // Skip checking if this shard is already enqueued for reconciliation
             if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
                 // If there is something delayed, then return a nonzero count so that
@@ -6033,8 +6032,6 @@ impl Service {
             if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                 reconciles_spawned += 1;
             }
-
-            schedule_context.avoid(&shard.intent.all_pageservers());
         }
 
         reconciles_spawned
@@ -6103,95 +6100,62 @@ impl Service {
     }
 
     fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> {
-        let mut schedule_context = ScheduleContext::default();
-
-        let mut tenant_shards: Vec<&TenantShard> = Vec::new();
-
         // How many candidate optimizations we will generate, before evaluating them for readniess: setting
         // this higher than the execution limit gives us a chance to execute some work even if the first
         // few optimizations we find are not ready.
         const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8;
 
         let mut work = Vec::new();
-
         let mut locked = self.inner.write().unwrap();
         let (nodes, tenants, scheduler) = locked.parts_mut();
-        for (tenant_shard_id, shard) in tenants.iter() {
-            if tenant_shard_id.is_shard_zero() {
-                // Reset accumulators on the first shard in a tenant
-                schedule_context = ScheduleContext::default();
-                schedule_context.mode = ScheduleMode::Speculative;
-                tenant_shards.clear();
-            }
 
-            if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
-                break;
-            }
-
-            match shard.get_scheduling_policy() {
-                ShardSchedulingPolicy::Active => {
-                    // Ok to do optimization
+        for (_tenant_id, schedule_context, shards) in
+            TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
+        {
+            for shard in shards {
+                if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
+                    break;
                 }
-                ShardSchedulingPolicy::Essential
-                | ShardSchedulingPolicy::Pause
-                | ShardSchedulingPolicy::Stop => {
-                    // Policy prevents optimizing this shard.
-                    continue;
+                match shard.get_scheduling_policy() {
+                    ShardSchedulingPolicy::Active => {
+                        // Ok to do optimization
+                    }
+                    ShardSchedulingPolicy::Essential
+                    | ShardSchedulingPolicy::Pause
+                    | ShardSchedulingPolicy::Stop => {
+                        // Policy prevents optimizing this shard.
+                        continue;
+                    }
                 }
-            }
 
-            // Accumulate the schedule context for all the shards in a tenant: we must have
-            // the total view of all shards before we can try to optimize any of them.
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
-            tenant_shards.push(shard);
-
-            // Once we have seen the last shard in the tenant, proceed to search across all shards
-            // in the tenant for optimizations
-            if shard.shard.number.0 == shard.shard.count.count() - 1 {
-                if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
+                if !matches!(shard.splitting, SplitState::Idle)
+                    || matches!(shard.policy, PlacementPolicy::Detached)
+                    || shard.reconciler.is_some()
+                {
                     // Do not start any optimizations while another change to the tenant is ongoing: this
                     // is not necessary for correctness, but simplifies operations and implicitly throttles
                     // optimization changes to happen in a "trickle" over time.
                     continue;
                 }
 
-                if tenant_shards.iter().any(|s| {
-                    !matches!(s.splitting, SplitState::Idle)
-                        || matches!(s.policy, PlacementPolicy::Detached)
-                }) {
-                    // Never attempt to optimize a tenant that is currently being split, or
-                    // a tenant that is meant to be detached
-                    continue;
-                }
-
                 // TODO: optimization calculations are relatively expensive: create some fast-path for
                 // the common idle case (avoiding the search on tenants that we have recently checked)
-
-                for shard in &tenant_shards {
-                    if let Some(optimization) =
-                        // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
-                        // its primary location based on soft constraints, cut it over.
-                        shard.optimize_attachment(nodes, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    } else if let Some(optimization) =
-                        // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
-                        // better placed on another node, based on ScheduleContext, then adjust it.  This
-                        // covers cases like after a shard split, where we might have too many shards
-                        // in the same tenant with secondary locations on the node where they originally split.
-                        shard.optimize_secondary(scheduler, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    }
-
-                    // TODO: extend this mechanism to prefer attaching on nodes with fewer attached
-                    // tenants (i.e. extend schedule state to distinguish attached from secondary counts),
-                    // for the total number of attachments on a node (not just within a tenant.)
+                if let Some(optimization) =
+                    // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
+                    // its primary location based on soft constraints, cut it over.
+                    shard.optimize_attachment(nodes, &schedule_context)
+                {
+                    work.push((shard.tenant_shard_id, optimization));
+                    break;
+                } else if let Some(optimization) =
+                    // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
+                    // better placed on another node, based on ScheduleContext, then adjust it.  This
+                    // covers cases like after a shard split, where we might have too many shards
+                    // in the same tenant with secondary locations on the node where they originally split.
+                    shard.optimize_secondary(scheduler, &schedule_context)
+                {
+                    work.push((shard.tenant_shard_id, optimization));
+                    break;
                 }
             }
         }
diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs
new file mode 100644
index 0000000000..d38010a27e
--- /dev/null
+++ b/storage_controller/src/service/context_iterator.rs
@@ -0,0 +1,139 @@
+use std::collections::BTreeMap;
+
+use utils::id::TenantId;
+use utils::shard::TenantShardId;
+
+use crate::scheduler::{ScheduleContext, ScheduleMode};
+use crate::tenant_shard::TenantShard;
+
+/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
+/// tenant while considering the individual shards within it.  This iterator is a helper
+/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
+/// for the tenant.
+pub(super) struct TenantShardContextIterator<'a> {
+    schedule_mode: ScheduleMode,
+    inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
+}
+
+impl<'a> TenantShardContextIterator<'a> {
+    pub(super) fn new(
+        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
+        schedule_mode: ScheduleMode,
+    ) -> Self {
+        Self {
+            schedule_mode,
+            inner: tenants.iter_mut(),
+        }
+    }
+}
+
+impl<'a> Iterator for TenantShardContextIterator<'a> {
+    type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut tenant_shards = Vec::new();
+        let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone());
+        loop {
+            let (tenant_shard_id, shard) = self.inner.next()?;
+
+            if tenant_shard_id.is_shard_zero() {
+                // Cleared on last shard of previous tenant
+                assert!(tenant_shards.is_empty());
+            }
+
+            // Accumulate the schedule context for all the shards in a tenant
+            schedule_context.avoid(&shard.intent.all_pageservers());
+            if let Some(attached) = shard.intent.get_attached() {
+                schedule_context.push_attached(*attached);
+            }
+            tenant_shards.push(shard);
+
+            if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
+                return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::BTreeMap, str::FromStr};
+
+    use pageserver_api::controller_api::PlacementPolicy;
+    use utils::shard::{ShardCount, ShardNumber};
+
+    use crate::{
+        scheduler::test_utils::make_test_nodes, service::Scheduler,
+        tenant_shard::tests::make_test_tenant_with_id,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_context_iterator() {
+        // Hand-crafted tenant IDs to ensure they appear in the expected order when put into
+        // a btreemap & iterated
+        let mut t_1_shards = make_test_tenant_with_id(
+            TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(1),
+            None,
+        );
+        let t_2_shards = make_test_tenant_with_id(
+            TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(4),
+            None,
+        );
+        let mut t_3_shards = make_test_tenant_with_id(
+            TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(1),
+            None,
+        );
+
+        let t1_id = t_1_shards[0].tenant_shard_id.tenant_id;
+        let t2_id = t_2_shards[0].tenant_shard_id.tenant_id;
+        let t3_id = t_3_shards[0].tenant_shard_id.tenant_id;
+
+        let mut tenants = BTreeMap::new();
+        tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap());
+        for shard in t_2_shards {
+            tenants.insert(shard.tenant_shard_id, shard);
+        }
+        tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap());
+
+        let nodes = make_test_nodes(3, &[]);
+        let mut scheduler = Scheduler::new(nodes.values());
+        let mut context = ScheduleContext::default();
+        for shard in tenants.values_mut() {
+            shard.schedule(&mut scheduler, &mut context).unwrap();
+        }
+
+        let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t1_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards.len(), 1);
+        assert_eq!(context.attach_count(), 1);
+
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t2_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1));
+        assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
+        assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
+        assert_eq!(shards.len(), 4);
+        assert_eq!(context.attach_count(), 4);
+
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t3_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards.len(), 1);
+        assert_eq!(context.attach_count(), 1);
+
+        for shard in tenants.values_mut() {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
+}
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 27c97d3b86..2eb98ee825 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1574,13 +1574,20 @@ pub(crate) mod tests {
         )
     }
 
-    fn make_test_tenant(
+    pub(crate) fn make_test_tenant(
         policy: PlacementPolicy,
         shard_count: ShardCount,
         preferred_az: Option<AvailabilityZone>,
     ) -> Vec<TenantShard> {
-        let tenant_id = TenantId::generate();
+        make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az)
+    }
 
+    pub(crate) fn make_test_tenant_with_id(
+        tenant_id: TenantId,
+        policy: PlacementPolicy,
+        shard_count: ShardCount,
+        preferred_az: Option<AvailabilityZone>,
+    ) -> Vec<TenantShard> {
         (0..shard_count.count())
             .map(|i| {
                 let shard_number = ShardNumber(i);

From a6073b5013fb1513e1f9937642fb3610f62854dc Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 29 Nov 2024 15:38:04 +0200
Subject: [PATCH 2/6] safekeeper: use jemalloc (#9780)

## Problem

To add Safekeeper heap profiling in #9778, we need to switch to an
allocator that supports it. Pageserver and proxy already use jemalloc.

Touches #9534.

## Summary of changes

Use jemalloc in Safekeeper.
---
 Cargo.lock                        |  1 +
 safekeeper/Cargo.toml             |  1 +
 safekeeper/benches/receive_wal.rs | 30 +++++++++++++++++++++++++++++-
 safekeeper/src/bin/safekeeper.rs  |  3 +++
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index f05c6311dd..abe69525c9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5409,6 +5409,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "thiserror",
+ "tikv-jemallocator",
  "tokio",
  "tokio-io-timeout",
  "tokio-postgres",
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 635a9222e1..0422c46ab1 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -41,6 +41,7 @@ serde_json.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
+tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["fs"] }
 tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs
index c637b4fb24..8c4281cf52 100644
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -6,6 +6,7 @@ mod benchutils;
 use std::io::Write as _;
 
 use benchutils::Env;
+use bytes::BytesMut;
 use camino_tempfile::tempfile;
 use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
 use itertools::Itertools as _;
@@ -23,6 +24,9 @@ const KB: usize = 1024;
 const MB: usize = 1024 * KB;
 const GB: usize = 1024 * MB;
 
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 // Register benchmarks with Criterion.
 criterion_group!(
     name = benches;
@@ -30,7 +34,8 @@ criterion_group!(
     targets = bench_process_msg,
     bench_wal_acceptor,
     bench_wal_acceptor_throughput,
-    bench_file_write
+    bench_file_write,
+    bench_bytes_reserve,
 );
 criterion_main!(benches);
 
@@ -341,3 +346,26 @@ fn bench_file_write(c: &mut Criterion) {
         Ok(())
     }
 }
+
+/// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic
+/// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc.
+fn bench_bytes_reserve(c: &mut Criterion) {
+    let mut g = c.benchmark_group("bytes_reserve");
+    for size in [1, 64, KB, 8 * KB, 128 * KB] {
+        g.throughput(criterion::Throughput::Bytes(size as u64));
+        g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap());
+    }
+
+    fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> {
+        let mut bytes = BytesMut::new();
+        let data = vec![0; size];
+
+        b.iter(|| {
+            bytes.reserve(size);
+            bytes.extend_from_slice(&data);
+            bytes.split_to(size).freeze();
+        });
+
+        Ok(())
+    }
+}
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 1248428d33..3659bcd7e0 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -48,6 +48,9 @@ use utils::{
     tcp_listener,
 };
 
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 const PID_FILE_NAME: &str = "safekeeper.pid";
 const ID_FILE_NAME: &str = "safekeeper.id";
 

From 538e2312a617c65d489d391892c70b2e4d7407b5 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 29 Nov 2024 14:55:56 +0100
Subject: [PATCH 3/6] feat(compute_ctl): Always set application_name (#9934)

## Problem

It was not always possible to judge what exactly some `cloud_admin`
connections were doing because we didn't consistently set
`application_name` everywhere.

## Summary of changes

Unify the way we connect to Postgres:
1. Switch to building configs everywhere
2. Always set `application_name` and make naming consistent

Follow-up for #9919
Part of neondatabase/cloud#20948
---
 compute_tools/src/bin/compute_ctl.rs      | 10 ++++-
 compute_tools/src/catalog.rs              |  7 +---
 compute_tools/src/checker.rs              |  3 +-
 compute_tools/src/compute.rs              | 49 ++++++++++++++++-------
 compute_tools/src/http/api.rs             | 11 +++--
 compute_tools/src/installed_extensions.rs | 14 +++----
 compute_tools/src/monitor.rs              | 13 +++---
 7 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 6b670de2ea..b178d7abd6 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -37,6 +37,7 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
+use std::str::FromStr;
 use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};
@@ -322,8 +323,15 @@ fn wait_spec(
     } else {
         spec_set = false;
     }
+    let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?;
+    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
+        .context("cannot build postgres config from connstr")?;
+    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
+        .context("cannot build tokio postgres config from connstr")?;
     let compute_node = ComputeNode {
-        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
+        connstr,
+        conn_conf,
+        tokio_conn_conf,
         pgdata: pgdata.to_string(),
         pgbin: pgbin.to_string(),
         pgversion: get_pg_version_string(pgbin),
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 08ae8bf44d..72198a9479 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -6,7 +6,6 @@ use tokio::{
     process::Command,
     spawn,
 };
-use tokio_postgres::connect;
 use tokio_stream::{self as stream, StreamExt};
 use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
@@ -16,10 +15,8 @@ use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgr
 use compute_api::responses::CatalogObjects;
 
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
-    let connstr = compute.connstr.clone();
-
-    let (client, connection): (tokio_postgres::Client, _) =
-        connect(connstr.as_str(), NoTls).await?;
+    let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles"));
+    let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?;
 
     spawn(async move {
         if let Err(e) = connection.await {
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index cec2b1bed8..62d61a8bc9 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -9,7 +9,8 @@ use crate::compute::ComputeNode;
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
     // Connect to the database.
-    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
+    let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker"));
+    let (client, connection) = conf.connect(NoTls).await?;
     if client.is_closed() {
         return Err(anyhow!("connection to postgres closed"));
     }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1a026a4014..da1caf1a9b 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,8 +20,9 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use nix::unistd::Pid;
+use postgres;
 use postgres::error::SqlState;
-use postgres::{Client, NoTls};
+use postgres::NoTls;
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -58,6 +59,10 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0);
 pub struct ComputeNode {
     // Url type maintains proper escaping
     pub connstr: url::Url,
+    // We connect to Postgres from many different places, so build configs once
+    // and reuse them where needed.
+    pub conn_conf: postgres::config::Config,
+    pub tokio_conn_conf: tokio_postgres::config::Config,
     pub pgdata: String,
     pub pgbin: String,
     pub pgversion: String,
@@ -800,10 +805,10 @@ impl ComputeNode {
     /// version. In the future, it may upgrade all 3rd-party extensions.
     #[instrument(skip_all)]
     pub fn post_apply_config(&self) -> Result<()> {
-        let connstr = self.connstr.clone();
+        let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config"));
         thread::spawn(move || {
             let func = || {
-                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                let mut client = conf.connect(NoTls)?;
                 handle_neon_extension_upgrade(&mut client)
                     .context("handle_neon_extension_upgrade")?;
                 Ok::<_, anyhow::Error>(())
@@ -815,12 +820,27 @@ impl ComputeNode {
         Ok(())
     }
 
+    pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config {
+        let mut conf = self.conn_conf.clone();
+        if let Some(application_name) = application_name {
+            conf.application_name(application_name);
+        }
+        conf
+    }
+
+    pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config {
+        let mut conf = self.tokio_conn_conf.clone();
+        if let Some(application_name) = application_name {
+            conf.application_name(application_name);
+        }
+        conf
+    }
+
     async fn get_maintenance_client(
         conf: &tokio_postgres::Config,
     ) -> Result<tokio_postgres::Client> {
         let mut conf = conf.clone();
-
-        conf.application_name("apply_config");
+        conf.application_name("compute_ctl:apply_config");
 
         let (client, conn) = match conf.connect(NoTls).await {
             // If connection fails, it may be the old node with `zenith_admin` superuser.
@@ -837,6 +857,7 @@ impl ComputeNode {
                         e
                     );
                     let mut zenith_admin_conf = postgres::config::Config::from(conf.clone());
+                    zenith_admin_conf.application_name("compute_ctl:apply_config");
                     zenith_admin_conf.user("zenith_admin");
 
                     let mut client =
@@ -1134,8 +1155,7 @@ impl ComputeNode {
     /// Do initial configuration of the already started Postgres.
     #[instrument(skip_all)]
     pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
-        conf.application_name("apply_config");
+        let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
 
         let conf = Arc::new(conf);
         let spec = Arc::new(
@@ -1161,7 +1181,7 @@ impl ComputeNode {
         thread::spawn(move || {
             let conf = conf.as_ref().clone();
             let mut conf = postgres::config::Config::from(conf);
-            conf.application_name("migrations");
+            conf.application_name("compute_ctl:migrations");
 
             let mut client = conf.connect(NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
@@ -1369,9 +1389,9 @@ impl ComputeNode {
             }
             self.post_apply_config()?;
 
-            let connstr = self.connstr.clone();
+            let conf = self.get_conn_conf(None);
             thread::spawn(move || {
-                let res = get_installed_extensions(&connstr);
+                let res = get_installed_extensions(conf);
                 match res {
                     Ok(extensions) => {
                         info!(
@@ -1510,7 +1530,8 @@ impl ComputeNode {
     /// Select `pg_stat_statements` data and return it as a stringified JSON
     pub async fn collect_insights(&self) -> String {
         let mut result_rows: Vec<String> = Vec::new();
-        let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
+        let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights"));
+        let connect_result = conf.connect(NoTls).await;
         let (client, connection) = connect_result.unwrap();
         tokio::spawn(async move {
             if let Err(e) = connection.await {
@@ -1636,10 +1657,9 @@ LIMIT 100",
         privileges: &[Privilege],
         role_name: &PgIdent,
     ) -> Result<()> {
-        use tokio_postgres::config::Config;
         use tokio_postgres::NoTls;
 
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants"));
         conf.dbname(db_name);
 
         let (db_client, conn) = conf
@@ -1676,10 +1696,9 @@ LIMIT 100",
         db_name: &PgIdent,
         ext_version: ExtVersion,
     ) -> Result<ExtVersion> {
-        use tokio_postgres::config::Config;
         use tokio_postgres::NoTls;
 
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension"));
         conf.dbname(db_name);
 
         let (db_client, conn) = conf
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index a6c6cff20a..7fa6426d8f 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -295,12 +295,11 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                 return Response::new(Body::from(msg));
             }
 
-            let connstr = compute.connstr.clone();
-            let res = task::spawn_blocking(move || {
-                installed_extensions::get_installed_extensions(&connstr)
-            })
-            .await
-            .unwrap();
+            let conf = compute.get_conn_conf(None);
+            let res =
+                task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
+                    .await
+                    .unwrap();
 
             match res {
                 Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index f473c29a55..5f62f08858 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -10,8 +10,6 @@ use metrics::core::Collector;
 use metrics::{register_uint_gauge_vec, UIntGaugeVec};
 use once_cell::sync::Lazy;
 
-use crate::pg_helpers::postgres_conf_for_db;
-
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -41,14 +39,16 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 ///
 /// Same extension can be installed in multiple databases with different versions,
 /// we only keep the highest and lowest version across all databases.
-pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtensions> {
-    let mut client = Client::connect(connstr.as_str(), NoTls)?;
+pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
+    conf.application_name("compute_ctl:get_installed_extensions");
+    let mut client = conf.connect(NoTls)?;
+
     let databases: Vec<String> = list_dbs(&mut client)?;
 
     let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
     for db in databases.iter() {
-        let config = postgres_conf_for_db(connstr, db)?;
-        let mut db_client = config.connect(NoTls)?;
+        conf.dbname(db);
+        let mut db_client = conf.connect(NoTls)?;
         let extensions: Vec<(String, String)> = db_client
             .query(
                 "SELECT extname, extversion FROM pg_catalog.pg_extension;",
@@ -82,7 +82,7 @@ pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtension
     }
 
     let res = InstalledExtensions {
-        extensions: extensions_map.values().cloned().collect(),
+        extensions: extensions_map.into_values().collect(),
     };
 
     Ok(res)
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index d7127aac32..184f380a8d 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -17,11 +17,8 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // should be handled gracefully.
 fn watch_compute_activity(compute: &ComputeNode) {
     // Suppose that `connstr` doesn't change
-    let mut connstr = compute.connstr.clone();
-    connstr
-        .query_pairs_mut()
-        .append_pair("application_name", "compute_activity_monitor");
-    let connstr = connstr.as_str();
+    let connstr = compute.connstr.clone();
+    let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
 
     // During startup and configuration we connect to every Postgres database,
     // but we don't want to count this as some user activity. So wait until
@@ -29,7 +26,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
     wait_for_postgres_start(compute);
 
     // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = Client::connect(connstr, NoTls);
+    let mut client = conf.connect(NoTls);
 
     let mut sleep = false;
     let mut prev_active_time: Option<f64> = None;
@@ -57,7 +54,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     info!("connection to Postgres is closed, trying to reconnect");
 
                     // Connection is closed, reconnect and try again.
-                    client = Client::connect(connstr, NoTls);
+                    client = conf.connect(NoTls);
                     continue;
                 }
 
@@ -196,7 +193,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                 debug!("could not connect to Postgres: {}, retrying", e);
 
                 // Establish a new connection and try again.
-                client = Client::connect(connstr, NoTls);
+                client = conf.connect(NoTls);
             }
         }
     }

From d5624cc50521098d16a49ad92a735184a48981ae Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Nov 2024 15:11:44 +0000
Subject: [PATCH 4/6] pageserver: download small objects using a smaller
 timeout (#9938)

## Problem

It appears that the Azure storage API tends to hang TCP connections more
than S3 does.

Currently we use a 2 minute timeout for all downloads. This is large
because sometimes the objects we download are large. However, waiting 2
minutes when doing something like downloading a manifest on tenant
attach is problematic, because when someone is doing a "create tenant,
create timeline" workflow, that 2 minutes is long enough for them
reasonably to give up creating that timeline.

Rather than propagate oversized timeouts further up the stack, we should
use a different timeout for objects that we expect to be small.

Closes: https://github.com/neondatabase/neon/issues/9836

## Summary of changes

- Add a `small_timeout` configuration attribute to remote storage,
defaulting to 30 seconds (still a very generous period to do something
like download an index)
- Add a DownloadKind parameter to DownloadOpts, so that callers can
indicate whether they expect the object to be small or large.
- In the azure client, use small timeout for HEAD requests, and for GET
requests if DownloadKind::Small is used.
- Use DownloadKind::Small for manifests, indices, and heatmap downloads.

This PR intentionally does not make the equivalent change to the S3
client, to reduce blast radius in case this has unexpected consequences
(we could accomplish the same thing by editing lots of configs, but just
skipping the code is simpler for right now)
---
 libs/remote_storage/src/azure_blob.rs         | 23 ++++++++++++++---
 libs/remote_storage/src/config.rs             | 25 ++++++++++++++++---
 libs/remote_storage/src/lib.rs                | 20 ++++++++++++++-
 libs/remote_storage/tests/test_real_azure.rs  |  3 ++-
 libs/remote_storage/tests/test_real_s3.rs     |  1 +
 pageserver/src/deletion_queue.rs              |  1 +
 pageserver/src/tenant.rs                      |  1 +
 .../tenant/remote_timeline_client/download.rs | 21 +++++++++++++---
 pageserver/src/tenant/secondary/downloader.rs |  3 ++-
 .../import_pgdata/importbucket_client.rs      |  4 ++-
 proxy/src/context/parquet.rs                  |  2 ++
 11 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 840917ef68..8d1962fa29 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -35,6 +35,7 @@ use utils::backoff;
 use utils::backoff::exponential_backoff_duration_seconds;
 
 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
+use crate::DownloadKind;
 use crate::{
     config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError,
     DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata,
@@ -49,10 +50,17 @@ pub struct AzureBlobStorage {
     concurrency_limiter: ConcurrencyLimiter,
     // Per-request timeout. Accessible for tests.
     pub timeout: Duration,
+
+    // Alternative timeout used for metadata objects which are expected to be small
+    pub small_timeout: Duration,
 }
 
 impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
+    pub fn new(
+        azure_config: &AzureConfig,
+        timeout: Duration,
+        small_timeout: Duration,
+    ) -> Result<Self> {
         debug!(
             "Creating azure remote storage for azure container {}",
             azure_config.container_name
@@ -94,6 +102,7 @@ impl AzureBlobStorage {
             max_keys_per_list_response,
             concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
             timeout,
+            small_timeout,
         })
     }
 
@@ -133,6 +142,7 @@ impl AzureBlobStorage {
     async fn download_for_builder(
         &self,
         builder: GetBlobBuilder,
+        timeout: Duration,
         cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         let kind = RequestKind::Get;
@@ -156,7 +166,7 @@ impl AzureBlobStorage {
                 .map_err(to_download_error);
 
             // apply per request timeout
-            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = tokio_stream::StreamExt::timeout(response, timeout);
 
             // flatten
             let response = response.map(|res| match res {
@@ -415,7 +425,7 @@ impl RemoteStorage for AzureBlobStorage {
         let blob_client = self.client.blob_client(self.relative_path_to_name(key));
         let properties_future = blob_client.get_properties().into_future();
 
-        let properties_future = tokio::time::timeout(self.timeout, properties_future);
+        let properties_future = tokio::time::timeout(self.small_timeout, properties_future);
 
         let res = tokio::select! {
             res = properties_future => res,
@@ -521,7 +531,12 @@ impl RemoteStorage for AzureBlobStorage {
             });
         }
 
-        self.download_for_builder(builder, cancel).await
+        let timeout = match opts.kind {
+            DownloadKind::Small => self.small_timeout,
+            DownloadKind::Large => self.timeout,
+        };
+
+        self.download_for_builder(builder, timeout, cancel).await
     }
 
     async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index e99ae4f747..f6ef31077c 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -24,6 +24,13 @@ pub struct RemoteStorageConfig {
         skip_serializing_if = "is_default_timeout"
     )]
     pub timeout: Duration,
+    /// Alternative timeout used for metadata objects which are expected to be small
+    #[serde(
+        with = "humantime_serde",
+        default = "default_small_timeout",
+        skip_serializing_if = "is_default_small_timeout"
+    )]
+    pub small_timeout: Duration,
 }
 
 impl RemoteStorageKind {
@@ -40,10 +47,18 @@ fn default_timeout() -> Duration {
     RemoteStorageConfig::DEFAULT_TIMEOUT
 }
 
+fn default_small_timeout() -> Duration {
+    RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
+}
+
 fn is_default_timeout(d: &Duration) -> bool {
     *d == RemoteStorageConfig::DEFAULT_TIMEOUT
 }
 
+fn is_default_small_timeout(d: &Duration) -> bool {
+    *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
+}
+
 /// A kind of a remote storage to connect to, with its connection configuration.
 #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
 #[serde(untagged)]
@@ -184,6 +199,7 @@ fn serialize_storage_class<S: serde::Serializer>(
 
 impl RemoteStorageConfig {
     pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
+    pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30);
 
     pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
         Ok(utils::toml_edit_ext::deserialize_item(toml)?)
@@ -219,7 +235,8 @@ timeout = '5s'";
                 storage: RemoteStorageKind::LocalFs {
                     local_path: Utf8PathBuf::from(".")
                 },
-                timeout: Duration::from_secs(5)
+                timeout: Duration::from_secs(5),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
@@ -247,7 +264,8 @@ timeout = '5s'";
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                     upload_storage_class: Some(StorageClass::IntelligentTiering),
                 }),
-                timeout: Duration::from_secs(7)
+                timeout: Duration::from_secs(7),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
@@ -299,7 +317,8 @@ timeout = '5s'";
                     concurrency_limit: default_remote_storage_azure_concurrency_limit(),
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                 }),
-                timeout: Duration::from_secs(7)
+                timeout: Duration::from_secs(7),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 719608dd5f..0ece29d99e 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -178,6 +178,15 @@ pub struct DownloadOpts {
     /// The end of the byte range to download, or unbounded. Must be after the
     /// start bound.
     pub byte_end: Bound<u64>,
+    /// Indicate whether we're downloading something small or large: this indirectly controls
+    /// timeouts: for something like an index/manifest/heatmap, we should time out faster than
+    /// for layer files
+    pub kind: DownloadKind,
+}
+
+pub enum DownloadKind {
+    Large,
+    Small,
 }
 
 impl Default for DownloadOpts {
@@ -186,6 +195,7 @@ impl Default for DownloadOpts {
             etag: Default::default(),
             byte_start: Bound::Unbounded,
             byte_end: Bound::Unbounded,
+            kind: DownloadKind::Large,
         }
     }
 }
@@ -584,6 +594,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 impl GenericRemoteStorage {
     pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
         let timeout = storage_config.timeout;
+
+        // If somkeone overrides timeout to be small without adjusting small_timeout, then adjust it automatically
+        let small_timeout = std::cmp::min(storage_config.small_timeout, timeout);
+
         Ok(match &storage_config.storage {
             RemoteStorageKind::LocalFs { local_path: path } => {
                 info!("Using fs root '{path}' as a remote storage");
@@ -606,7 +620,11 @@ impl GenericRemoteStorage {
                     .unwrap_or("<AZURE_STORAGE_ACCOUNT>");
                 info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'",
                       azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(
+                    azure_config,
+                    timeout,
+                    small_timeout,
+                )?))
             }
         })
     }
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 3a20649490..92d579fec8 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -219,7 +219,8 @@ async fn create_azure_client(
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
         }),
-        timeout: Duration::from_secs(120),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config)
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 3e99a65fac..e60ec18c93 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -396,6 +396,7 @@ async fn create_s3_client(
             upload_storage_class: None,
         }),
         timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config)
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e74c8ecf5a..1d508f5fe9 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -838,6 +838,7 @@ mod test {
                 local_path: remote_fs_dir.clone(),
             },
             timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+            small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
         };
         let storage = GenericRemoteStorage::from_config(&storage_config)
             .await
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 339a3ca1bb..cd0690bb1a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5423,6 +5423,7 @@ pub(crate) mod harness {
                     local_path: remote_fs_dir.clone(),
                 },
                 timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
             };
             let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
             let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d632e595ad..739615be9c 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -30,7 +30,9 @@ use crate::tenant::Generation;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
-use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath};
+use remote_storage::{
+    DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
+};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;
@@ -345,12 +347,13 @@ pub async fn list_remote_timelines(
 async fn do_download_remote_path_retry_forever(
     storage: &GenericRemoteStorage,
     remote_path: &RemotePath,
+    download_opts: DownloadOpts,
     cancel: &CancellationToken,
 ) -> Result<(Vec<u8>, SystemTime), DownloadError> {
     download_retry_forever(
         || async {
             let download = storage
-                .download(remote_path, &DownloadOpts::default(), cancel)
+                .download(remote_path, &download_opts, cancel)
                 .await?;
 
             let mut bytes = Vec::new();
@@ -377,8 +380,13 @@ async fn do_download_tenant_manifest(
 ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
     let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
 
+    let download_opts = DownloadOpts {
+        kind: DownloadKind::Small,
+        ..Default::default()
+    };
+
     let (manifest_bytes, manifest_bytes_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+        do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?;
 
     let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
         .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
@@ -398,8 +406,13 @@ async fn do_download_index_part(
         timeline_id.expect("A timeline ID is always provided when downloading an index");
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
+    let download_opts = DownloadOpts {
+        kind: DownloadKind::Small,
+        ..Default::default()
+    };
+
     let (index_part_bytes, index_part_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+        do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?;
 
     let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
         .with_context(|| format!("deserialize index part file at {remote_path:?}"))
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 7443261a9c..8d771dc405 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -49,7 +49,7 @@ use futures::Future;
 use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -946,6 +946,7 @@ impl<'a> TenantDownloader<'a> {
         let cancel = &self.secondary_state.cancel;
         let opts = DownloadOpts {
             etag: prev_etag.cloned(),
+            kind: DownloadKind::Small,
             ..Default::default()
         };
 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
index 8d5ab1780f..bc4d148a29 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -4,7 +4,8 @@ use anyhow::Context;
 use bytes::Bytes;
 use postgres_ffi::ControlFileData;
 use remote_storage::{
-    Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
+    Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
+    ListingObject, RemotePath,
 };
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
@@ -239,6 +240,7 @@ impl RemoteStorageWrapper {
                     .download(
                         path,
                         &DownloadOpts {
+                            kind: DownloadKind::Large,
                             etag: None,
                             byte_start: Bound::Included(start_inclusive),
                             byte_end: Bound::Excluded(end_exclusive)
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index e328c6de79..b375eb886e 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -486,6 +486,7 @@ mod tests {
                     upload_storage_class: None,
                 }),
                 timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
             })
         );
         assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
@@ -545,6 +546,7 @@ mod tests {
                 local_path: tmpdir.to_path_buf(),
             },
             timeout: std::time::Duration::from_secs(120),
+            small_timeout: std::time::Duration::from_secs(30),
         };
         let storage = GenericRemoteStorage::from_config(&remote_storage_config)
             .await

From c848f25ec25e04afba9f2b0509372504b35cafe9 Mon Sep 17 00:00:00 2001
From: Gleb Novikov <NanoBjorn@users.noreply.github.com>
Date: Fri, 29 Nov 2024 17:58:36 +0000
Subject: [PATCH 5/6] Fixed fast_import pgbin in calling get_pg_version (#9933)

Was working on https://github.com/neondatabase/cloud/pull/20795 and
discovered that fast_import is not working normally.
---
 compute_tools/src/bin/fast_import.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 6716cc6234..b6db3eb11a 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -21,7 +21,7 @@
 //! - Build the image with the following command:
 //!
 //! ```bash
-//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)"  -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com
+//! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile .
 //! docker push localhost:3030/localregistry/compute-node-v14:latest
 //! ```
 
@@ -132,7 +132,8 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     //
     //  Initialize pgdata
     //
-    let pg_version = match get_pg_version(pg_bin_dir.as_str()) {
+    let pgbin = pg_bin_dir.join("postgres");
+    let pg_version = match get_pg_version(pgbin.as_ref()) {
         PostgresMajorVersion::V14 => 14,
         PostgresMajorVersion::V15 => 15,
         PostgresMajorVersion::V16 => 16,
@@ -155,7 +156,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     //
     // Launch postgres process
     //
-    let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres"))
+    let mut postgres_proc = tokio::process::Command::new(pgbin)
         .arg("-D")
         .arg(&pgdata_dir)
         .args(["-c", "wal_level=minimal"])

From 973a8d2680f968e83e5668e69c87636189146e54 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 29 Nov 2024 20:10:26 +0100
Subject: [PATCH 6/6] Fix timeout value used in XLogWaitForReplayOf (#9937)

The previous value assumed usec precision, while the timeout used is in
milliseconds, causing replica backends to wait for (potentially) many
hours for WAL replay without the expected progress reports in logs.

This fixes the issue.

Reported-By: Alexander Lakhin <exclusion@gmail.com>

## Problem


https://github.com/neondatabase/postgres/pull/279#issuecomment-2507671817

The timeout value was configured with the assumption the indicated value
would be microseconds, where it's actually milliseconds. That causes the
backend to wait for much longer (2h46m40s) before it emits the "I'm
waiting for recovery" message. While we do have wait events configured
on this, it's not great to have stuck backends without clear logs, so
this fixes the timeout value in all our PostgreSQL branches.

## PG PRs

* PG14: https://github.com/neondatabase/postgres/pull/542
* PG15: https://github.com/neondatabase/postgres/pull/543
* PG16: https://github.com/neondatabase/postgres/pull/544
* PG17: https://github.com/neondatabase/postgres/pull/545
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 284ae56be2..c1989c934d 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5
+Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index aed79ee87b..d929b9a8b9 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05
+Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index f5cfc6fa89..13e9e35394 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff
+Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 3c15b6565f..faebe5e5af 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f
+Subproject commit faebe5e5aff5687908504453623778f8515529db
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 4dae88e73d..abeddcadf7 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f"
+    "faebe5e5aff5687908504453623778f8515529db"
   ],
   "v16": [
     "16.6",
-    "f5cfc6fa898544050e821ac688adafece1ac3cff"
+    "13e9e3539419003e79bd9aa29e1bc44f3fd555dd"
   ],
   "v15": [
     "15.10",
-    "aed79ee87b94779cc52ec13e3b74eba6ada93f05"
+    "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6"
   ],
   "v14": [
     "14.15",
-    "284ae56be2397fd3eaf20777fa220b2d0ad968f5"
+    "c1989c934d46e04e78b3c496c8a34bcd40ddceeb"
   ]
 }