From bea8efac242e32ad0850153f5199e8d15b5fd91c Mon Sep 17 00:00:00 2001 From: Richy Wang Date: Tue, 7 Nov 2023 16:13:01 +0800 Subject: [PATCH 01/36] Fix comments in 'receive_wal.rs'. (#5807) ## Problem Some comments in 'receive_wal.rs' is not suitable. It may copy from 'send_wal.rs' and leave it unchanged. ## Summary of changes This commit fixes two comments in the code: Changed "/// Unregister walsender." to "/// Unregister walreceiver." Changed "///Scope guard to access slot in WalSenders registry" to "///Scope guard to access slot in WalReceivers registry." --- safekeeper/src/receive_wal.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 934dda10b2..9ce9b049ba 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -111,7 +111,7 @@ impl WalReceivers { .count() } - /// Unregister walsender. + /// Unregister walreceiver. fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; @@ -138,8 +138,8 @@ pub enum WalReceiverStatus { Streaming, } -/// Scope guard to access slot in WalSenders registry and unregister from it in -/// Drop. +/// Scope guard to access slot in WalReceivers registry and unregister from +/// it in Drop. pub struct WalReceiverGuard { id: WalReceiverId, walreceivers: Arc, From c00651ff9bf7941aa617f6185ee9f25f74555d8a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 Nov 2023 09:06:53 +0000 Subject: [PATCH 02/36] pageserver: start refactoring into TenantManager (#5797) ## Problem See: https://github.com/neondatabase/neon/issues/5796 ## Summary of changes Completing the refactor is quite verbose and can be done in stages: each interface that is currently called directly from a top-level mgr.rs function can be moved into TenantManager once the relevant subsystems have access to it. Landing the initial change to create of TenantManager is useful because it enables new code to use it without having to be altered later, and sets us up to incrementally fix the existing code to use an explicit Arc instead of relying on the static TENANTS. 
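As a minimal sketch of the refactoring direction (illustrative types only; it assumes nothing about the real `TenantManager` beyond what the diff below introduces), the idea is that free functions reaching into a global static become methods on a value that subsystems receive as an explicit `Arc`:

```rust
use std::sync::{Arc, RwLock};

// Hypothetical stand-in for the tenants collection; not the real pageserver types.
static TENANTS: RwLock<Vec<String>> = RwLock::new(Vec::new());

// Before: a free function that implicitly reaches into the global static.
fn upsert_tenant_global(name: &str) {
    TENANTS.write().unwrap().push(name.to_string());
}

// After: the same operation as a method on a manager the caller holds explicitly.
struct TenantManager {
    tenants: RwLock<Vec<String>>,
}

impl TenantManager {
    fn upsert_tenant(&self, name: &str) {
        self.tenants.write().unwrap().push(name.to_string());
    }
}

fn main() {
    upsert_tenant_global("a");

    let mgr = Arc::new(TenantManager {
        tenants: RwLock::new(Vec::new()),
    });
    mgr.upsert_tenant("b"); // callers hold an Arc<TenantManager> instead of touching TENANTS
}
```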
--- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/http/routes.rs | 29 ++-- pageserver/src/tenant/mgr.rs | 258 ++++++++++++++++--------------- 3 files changed, 153 insertions(+), 138 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index df2db25ec6..93ae43a14b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -410,7 +410,7 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); - BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), @@ -420,6 +420,7 @@ fn start_pageserver( order, shutdown_pageserver.clone(), ))?; + let tenant_manager = Arc::new(tenant_manager); BACKGROUND_RUNTIME.spawn({ let init_done_rx = init_done_rx; @@ -548,6 +549,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, + tenant_manager, http_auth.clone(), remote_storage.clone(), broker_client.clone(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 820b8eae46..a0643bc580 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -35,8 +35,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlotError, TenantSlotUpsertError, TenantStateError, }; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -63,6 +63,7 @@ use super::models::ConfigureFailpointsRequest; pub struct State { conf: &'static PageServerConf, + tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, remote_storage: Option, @@ -74,6 +75,7 @@ pub struct State { impl State { pub fn new( conf: &'static PageServerConf, + tenant_manager: Arc, auth: Option>, remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, @@ -86,6 +88,7 @@ impl State { .collect::>(); Ok(Self { conf, + tenant_manager, auth, allowlist_routes, remote_storage, @@ -1140,20 +1143,14 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - mgr::upsert_location( - state.conf, - tenant_id, - location_conf, - state.broker_client.clone(), - state.remote_storage.clone(), - state.deletion_queue_client.clone(), - &ctx, - ) - .await - // TODO: badrequest assumes the caller was asking for something unreasonable, but in - // principle we might have hit something like concurrent API calls to the same tenant, - // which is not a 400 but a 409. - .map_err(ApiError::BadRequest)?; + state + .tenant_manager + .upsert_location(tenant_id, location_conf, &ctx) + .await + // TODO: badrequest assumes the caller was asking for something unreasonable, but in + // principle we might have hit something like concurrent API calls to the same tenant, + // which is not a 400 but a 409. 
+ .map_err(ApiError::BadRequest)?; json_response(StatusCode::OK, ()) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 576bcea2ce..664181e40d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -200,6 +200,22 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> { Ok(()) } +/// The TenantManager is responsible for storing and mutating the collection of all tenants +/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance +/// lives inside the TenantManager. +/// +/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach +/// the same tenant twice concurrently, or trying to configure the same tenant into secondary +/// and attached modes concurrently. +pub struct TenantManager { + conf: &'static PageServerConf, + // TODO: currently this is a &'static pointing to TENANTs. When we finish refactoring + // out of that static variable, the TenantManager can own this. + // See https://github.com/neondatabase/neon/issues/5796 + tenants: &'static std::sync::RwLock, + resources: TenantSharedResources, +} + fn emergency_generations( tenant_confs: &HashMap>, ) -> HashMap { @@ -366,7 +382,7 @@ pub async fn init_tenant_mgr( resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, -) -> anyhow::Result<()> { +) -> anyhow::Result { let mut tenants = HashMap::new(); let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); @@ -468,7 +484,12 @@ pub async fn init_tenant_mgr( assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); METRICS.tenant_slots.set(tenants.len() as u64); *tenants_map = TenantsMap::Open(tenants); - Ok(()) + + Ok(TenantManager { + conf, + tenants: &TENANTS, + resources, + }) } /// Wrapper for Tenant::spawn that checks invariants before running, and inserts @@ -742,139 +763,134 @@ pub(crate) async fn set_new_tenant_config( Ok(()) } -#[instrument(skip_all, fields(%tenant_id))] -pub(crate) async fn upsert_location( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_location_config: LocationConf, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, - deletion_queue_client: DeletionQueueClient, - ctx: &RequestContext, -) -> Result<(), anyhow::Error> { - info!("configuring tenant location {tenant_id} to state {new_location_config:?}"); +impl TenantManager { + #[instrument(skip_all, fields(%tenant_id))] + pub(crate) async fn upsert_location( + &self, + tenant_id: TenantId, + new_location_config: LocationConf, + ctx: &RequestContext, + ) -> Result<(), anyhow::Error> { + info!("configuring tenant location {tenant_id} to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, - // then we do not need to set the slot to InProgress, we can just call into the - // existng tenant. - { - let locked = TENANTS.read().unwrap(); - let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?; - match (&new_location_config.mode, peek_slot) { - (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { - if attach_conf.generation == tenant.generation { - // A transition from Attached to Attached in the same generation, we may - // take our fast path and just provide the updated configuration - // to the tenant. 
- tenant.set_new_location_config(AttachedTenantConf::try_from( - new_location_config, - )?); + // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + // then we do not need to set the slot to InProgress, we can just call into the + // existng tenant. + { + let locked = TENANTS.read().unwrap(); + let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?; + match (&new_location_config.mode, peek_slot) { + (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { + if attach_conf.generation == tenant.generation { + // A transition from Attached to Attached in the same generation, we may + // take our fast path and just provide the updated configuration + // to the tenant. + tenant.set_new_location_config(AttachedTenantConf::try_from( + new_location_config, + )?); - // Persist the new config in the background, to avoid holding up any - // locks while we do so. - // TODO + // Persist the new config in the background, to avoid holding up any + // locks while we do so. + // TODO - return Ok(()); - } else { - // Different generations, fall through to general case + return Ok(()); + } else { + // Different generations, fall through to general case + } + } + _ => { + // Not an Attached->Attached transition, fall through to general case } } - _ => { - // Not an Attached->Attached transition, fall through to general case - } } - } - // General case for upserts to TenantsMap, excluding the case above: we will substitute an - // InProgress value to the slot while we make whatever changes are required. The state for - // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: - // the state is ill-defined while we're in transition. Transitions are async, but fast: we do - // not do significant I/O, and shutdowns should be prompt via cancellation tokens. - let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?; + // General case for upserts to TenantsMap, excluding the case above: we will substitute an + // InProgress value to the slot while we make whatever changes are required. The state for + // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: + // the state is ill-defined while we're in transition. Transitions are async, but fast: we do + // not do significant I/O, and shutdowns should be prompt via cancellation tokens. + let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?; - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. - let (_guard, progress) = utils::completion::channel(); + if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. 
+ let (_guard, progress) = utils::completion::channel(); - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. - deletion_queue_client.flush_advisory() + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. + self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; + + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions + slot_guard.drop_old_value().expect("We just shut it down"); + } + + let tenant_path = self.conf.tenant_path(&tenant_id); + + let new_slot = match &new_location_config.mode { + LocationMode::Secondary(_) => { + let tenant_path = self.conf.tenant_path(&tenant_id); + // Directory doesn't need to be fsync'd because if we crash it can + // safely be recreated next time this tenant location is configured. + unsafe_create_dir_all(&tenant_path) + .await + .with_context(|| format!("Creating {tenant_path}"))?; + + Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + TenantSlot::Secondary + } + LocationMode::Attached(_attach_config) => { + let timelines_path = self.conf.timelines_path(&tenant_id); + + // Directory doesn't need to be fsync'd because we do not depend on + // it to exist after crashes: it may be recreated when tenant is + // re-attached, see https://github.com/neondatabase/neon/issues/5550 + unsafe_create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + let tenant = tenant_spawn( + self.conf, + tenant_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(new_location_config)?, + None, + self.tenants, + SpawnMode::Normal, + ctx, + )?; + + TenantSlot::Attached(tenant) } }; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; - } - } - slot_guard.drop_old_value().expect("We just shut it down"); + slot_guard.upsert(new_slot)?; + + Ok(()) } - - let tenant_path = conf.tenant_path(&tenant_id); - - let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - let tenant_path = conf.tenant_path(&tenant_id); - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. 
- unsafe_create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } - LocationMode::Attached(_attach_config) => { - let timelines_path = conf.timelines_path(&tenant_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - unsafe_create_dir_all(&timelines_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - let tenant = tenant_spawn( - conf, - tenant_id, - &tenant_path, - TenantSharedResources { - broker_client, - remote_storage, - deletion_queue_client, - }, - AttachedTenantConf::try_from(new_location_config)?, - None, - &TENANTS, - SpawnMode::Normal, - ctx, - )?; - - TenantSlot::Attached(tenant) - } - }; - - slot_guard.upsert(new_slot)?; - - Ok(()) } #[derive(Debug, thiserror::Error)] From a394f49e0dba004884dd1f7c7bc7e5888a3e5fb8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 Nov 2023 09:35:45 +0000 Subject: [PATCH 03/36] pageserver: avoid converting an error to anyhow::Error (#5803) This was preventing it getting cleanly converted to a CalculateLogicalSizeError::Cancelled, resulting in "Logical size calculation failed" errors in logs. --- pageserver/src/pgdatadir_mapping.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 88974588d4..aa4d155bcc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -589,11 +589,7 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self - .list_rels(*spcnode, *dbnode, lsn, ctx) - .await - .context("list rels")? - { + for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { if cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } From 4be6bc7251ebe857fada6c94d8e7dbe31da2f839 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 7 Nov 2023 12:26:25 +0200 Subject: [PATCH 04/36] refactor: remove unnecessary unsafe (#5802) unsafe impls for `Send` and `Sync` should not be added by default. in the case of `SlotGuard` removing them does not cause any issues, as the compiler automatically derives those. This PR adds requirement to document the unsafety (see [clippy::undocumented_unsafe_blocks]) and opportunistically adds `#![deny(unsafe_code)]` to most places where we don't have unsafe code right now. 
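To illustrate the point about auto-derivation (a sketch with hypothetical field types, not the actual `SlotGuard` definition): `Send` and `Sync` are auto traits, so a struct whose fields are all `Send + Sync` gets both for free, and the manual `unsafe impl`s add nothing.

```rust
use std::sync::{Arc, Mutex};

// Hypothetical guard with a similar shape of fields: an index plus shared,
// lock-protected state.
struct GuardLike {
    _id: usize,
    _shared: Arc<Mutex<Vec<Option<u32>>>>,
}

// If every field is Send + Sync, the struct is too -- no `unsafe impl` needed.
fn assert_send_sync<T: Send + Sync>() {}

fn main() {
    assert_send_sync::<GuardLike>();
}
```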
TRPL on Send and Sync: https://doc.rust-lang.org/book/ch16-04-extensible-concurrency-sync-and-send.html [clippy::undocumented_unsafe_blocks]: https://rust-lang.github.io/rust-clippy/master/#/undocumented_unsafe_blocks --- compute_tools/src/lib.rs | 4 ++-- control_plane/src/background_process.rs | 2 +- control_plane/src/lib.rs | 15 +++++++------- libs/compute_api/src/lib.rs | 2 ++ libs/consumption_metrics/src/lib.rs | 4 ++-- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/lib.rs | 2 ++ libs/postgres_backend/src/lib.rs | 2 ++ libs/postgres_connection/src/lib.rs | 2 ++ libs/postgres_ffi/src/lib.rs | 2 ++ libs/pq_proto/src/lib.rs | 1 + libs/remote_storage/src/lib.rs | 2 ++ libs/safekeeper_api/src/lib.rs | 2 ++ libs/tenant_size_model/src/lib.rs | 2 ++ libs/tracing-utils/src/lib.rs | 2 ++ libs/utils/src/id.rs | 2 ++ libs/utils/src/lib.rs | 1 + libs/utils/src/shutdown.rs | 3 ++- libs/vm_monitor/src/lib.rs | 2 ++ pageserver/src/lib.rs | 2 ++ pageserver/src/tenant/mgr.rs | 3 --- pageserver/src/walredo.rs | 26 ++++++++++++------------- proxy/src/lib.rs | 2 ++ s3_scrubber/src/lib.rs | 2 ++ safekeeper/src/lib.rs | 1 + 25 files changed, 59 insertions(+), 30 deletions(-) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 1cd9602089..4e01ffd954 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -1,7 +1,7 @@ -//! //! Various tools and helpers to handle cluster / compute node (Postgres) //! configuration. -//! +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] pub mod checker; pub mod config; pub mod configurator; diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index c0016ece17..26fc08fc8f 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -262,7 +262,7 @@ where P: Into, { let path: Utf8PathBuf = path.into(); - // SAFETY + // SAFETY: // pre_exec is marked unsafe because it runs between fork and exec. // Why is that dangerous in various ways? // Long answer: https://github.com/rust-lang/rust/issues/39575 diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 7592880402..bb79d36bfc 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -1,11 +1,10 @@ -// -// Local control plane. -// -// Can start, configure and stop postgres instances running as a local processes. -// -// Intended to be used in integration tests and in CLI tools for -// local installations. -// +//! Local control plane. +//! +//! Can start, configure and stop postgres instances running as a local processes. +//! +//! Intended to be used in integration tests and in CLI tools for +//! local installations. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod attachment_service; mod background_process; diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index b660799ec0..210a52d089 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 9e89327e84..810196aff6 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -1,6 +1,6 @@ -//! //! Shared code for consumption metics collection -//! 
+#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use chrono::{DateTime, Utc}; use rand::Rng; use serde::{Deserialize, Serialize}; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 0b3b24c18e..ed375a152f 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -2,6 +2,7 @@ //! make sure that we use the same dep version everywhere. //! Otherwise, we might not see all metrics registered via //! a default registry. +#![deny(clippy::undocumented_unsafe_blocks)] use once_cell::sync::Lazy; use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index d844021785..e49f7d00c1 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; /// Public API types diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 455fe7a481..1e25c6ed6a 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -2,6 +2,8 @@ //! To use, create PostgresBackend and run() it, passing the Handler //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; use futures::pin_mut; diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 35344a9168..35cb1a2691 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use anyhow::{bail, Context}; use itertools::Itertools; use std::borrow::Cow; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index d0009d0dc9..d10ebfe277 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -8,6 +8,7 @@ // modules included with the postgres_ffi macro depend on the types of the specific version's // types, and trigger a too eager lint. #![allow(clippy::duplicate_mod)] +#![deny(clippy::undocumented_unsafe_blocks)] use bytes::Bytes; use utils::bin_ser::SerializeError; @@ -20,6 +21,7 @@ macro_rules! postgres_ffi { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] + #![allow(clippy::undocumented_unsafe_blocks)] use serde::{Deserialize, Serialize}; include!(concat!( diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94bc7f60f8..41fc206cd7 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -1,6 +1,7 @@ //! Postgres protocol messages serialization-deserialization. See //! //! on message formats. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod framed; diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 9b5b340db0..7054610d9e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -6,6 +6,8 @@ //! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! 
+#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; mod local_fs; diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index 0a391478da..63c2c51188 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; /// Public API types diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index c151e3b42c..a3e12cf0e3 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,4 +1,6 @@ //! Synthetic size calculation +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] mod calculation; pub mod svg; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index de0e2ad799..9cf2495771 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -32,6 +32,8 @@ //! .init(); //! } //! ``` +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use opentelemetry::sdk::Resource; use opentelemetry::KeyValue; diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index eef6a358e6..57dcc27719 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -120,6 +120,8 @@ impl Id { chunk[0] = HEX[((b >> 4) & 0xf) as usize]; chunk[1] = HEX[(b & 0xf) as usize]; } + + // SAFETY: vec constructed out of `HEX`, it can only be ascii unsafe { String::from_utf8_unchecked(buf) } } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index ff5d774285..bb6c848bf4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,5 +1,6 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod backoff; diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs index 7eba905997..cb5a44d664 100644 --- a/libs/utils/src/shutdown.rs +++ b/libs/utils/src/shutdown.rs @@ -1,6 +1,7 @@ /// Immediately terminate the calling process without calling /// atexit callbacks, C runtime destructors etc. We mainly use /// this to protect coverage data from concurrent writes. -pub fn exit_now(code: u8) { +pub fn exit_now(code: u8) -> ! 
{ + // SAFETY: exiting is safe, the ffi is not safe unsafe { nix::libc::_exit(code as _) }; } diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index a844f78bd6..89ca91fdd7 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] use anyhow::Context; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 04c3ae9746..3f74694ef2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(clippy::undocumented_unsafe_blocks)] + mod auth; pub mod basebackup; pub mod config; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 664181e40d..818872c093 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1446,9 +1446,6 @@ pub struct SlotGuard { _completion: utils::completion::Completion, } -unsafe impl Send for SlotGuard {} -unsafe impl Sync for SlotGuard {} - impl SlotGuard { fn new( tenant_id: TenantId, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e1d699fcba..38f2c64420 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -596,21 +596,21 @@ trait CloseFileDescriptors: CommandExt { impl CloseFileDescriptors for C { fn close_fds(&mut self) -> &mut Command { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. unsafe { self.pre_exec(move || { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. 
close_fds::set_fds_cloexec_threadsafe(3, &[]); Ok(()) }) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 803b278482..4a95e473f6 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(clippy::undocumented_unsafe_blocks)] + use std::convert::Infallible; use anyhow::{bail, Context}; diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d892438633..777276a4d1 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; pub mod garbage; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index aa70cee591..344b0b14e4 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,3 +1,4 @@ +#![deny(clippy::undocumented_unsafe_blocks)] use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; From 0ac4cf67a61534dcfbaeaea1b35e5dbac50f2bb8 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Tue, 7 Nov 2023 13:38:02 +0200 Subject: [PATCH 05/36] Use self.tenants instead of TENANTS (#5811) --- pageserver/src/tenant/mgr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 818872c093..27f9d50c54 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -777,7 +777,7 @@ impl TenantManager { // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. { - let locked = TENANTS.read().unwrap(); + let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?; match (&new_location_config.mode, peek_slot) { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { From 0141c957887aa9fe0340459d644d92cb9e6b0627 Mon Sep 17 00:00:00 2001 From: Fernando Luz Date: Tue, 7 Nov 2023 12:13:05 +0000 Subject: [PATCH 06/36] build: Add warning when missing postgres submodule during the build (#5614) I forked the project and in my local repo, I wasn't able to compile the project and in my search, I found the solution in neon forum. After a PR discussion, I made a change in the makefile to alert the missing `git submodules update` step. --------- Signed-off-by: Fernando Luz Co-authored-by: Joonas Koivunen --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 64bbc1677c..89acbe564a 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib # $(POSTGRES_INSTALL_DIR)/build/%/config.status: +@echo "Configuring Postgres $* build" + @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ + echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ + echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ + exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ From 4cd47b7d4bfcb0483f0b717d977a8cb6064dd979 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 7 Nov 2023 13:45:59 +0000 Subject: [PATCH 07/36] Dockerfile: Set BUILD_TAG for storage services (#5812) ## Problem https://github.com/neondatabase/neon/pull/5576 added `build-tag` reporting to `libmetrics_build_info`, but it's not reported because we didn't set the corresponding env variable in the build process. 
## Summary of changes - Add `BUILD_TAG` env var while building services --- .github/workflows/build_and_test.yml | 1 + Dockerfile | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0f8afc954..babf767fcf 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -723,6 +723,7 @@ jobs: --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} diff --git a/Dockerfile b/Dockerfile index eb4c4bba25..60de9cfa3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN set -e \ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local +ARG BUILD_TAG # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. @@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ From 1d68f52b57e91fe19b5a1695d75ab92775be7fd8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 Nov 2023 14:25:51 +0000 Subject: [PATCH 08/36] pageserver: move deletion failpoint inside backoff (#5814) ## Problem When enabled, this failpoint would busy-spin in a loop that emits log messages. ## Summary of changes Move the failpoint inside a backoff::exponential block: it will still spam the log, but at much lower rate. 
--------- Co-authored-by: Joonas Koivunen --- pageserver/src/deletion_queue/deleter.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 81cb016d49..57421b1547 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -55,21 +55,24 @@ impl Deleter { /// Wrap the remote `delete_objects` with a failpoint async fn remote_delete(&self) -> Result<(), anyhow::Error> { - fail::fail_point!("deletion-queue-before-execute", |_| { - info!("Skipping execution, failpoint set"); - metrics::DELETION_QUEUE - .remote_errors - .with_label_values(&["failpoint"]) - .inc(); - Err(anyhow::anyhow!("failpoint hit")) - }); - // A backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors // - To absorb transient 429/503 conditions without hitting our error // logging path for issues deleting objects. backoff::retry( - || async { self.remote_storage.delete_objects(&self.accumulator).await }, + || async { + fail::fail_point!("deletion-queue-before-execute", |_| { + info!("Skipping execution, failpoint set"); + + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["failpoint"]) + .inc(); + Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) + }); + + self.remote_storage.delete_objects(&self.accumulator).await + }, |_| false, 3, 10, From e310533ed353e8e92c8b845f403d79eb991f8a88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 7 Nov 2023 15:43:29 +0100 Subject: [PATCH 09/36] Support JWT key reload in pageserver (#5594) ## Problem For quickly rotating JWT secrets, we want to be able to reload the JWT public key file in the pageserver, and also support multiple JWT keys. See #4897. ## Summary of changes * Allow directories for the `auth_validation_public_key_path` config param instead of just files. for the safekeepers, all of their config options also support multiple JWT keys. * For the pageservers, make the JWT public keys easily globally swappable by using the `arc-swap` crate. * Add an endpoint to the pageserver, triggered by a POST to `/v1/reload_auth_validation_keys`, that reloads the JWT public keys from the pre-configured path (for security reasons, you cannot upload any keys yourself). 
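A minimal sketch of the arc-swap pattern mentioned in the summary above (hypothetical `Keys` type, not the pageserver's `JwtAuth`/`SwappableJwtAuth`; it only assumes the `arc-swap` crate this PR adds): readers take a cheap atomic snapshot while a reload endpoint can install new keys without any shared lock.

```rust
use std::sync::Arc;
use arc_swap::ArcSwap;

// Hypothetical key set; only the swap pattern is the point here.
struct Keys(Vec<String>);

struct SwappableKeys(ArcSwap<Keys>);

impl SwappableKeys {
    fn new(initial: Keys) -> Self {
        SwappableKeys(ArcSwap::new(Arc::new(initial)))
    }

    // Readers grab a snapshot; token validation never holds a lock.
    fn current(&self) -> Arc<Keys> {
        self.0.load_full()
    }

    // A reload endpoint can atomically install freshly parsed keys.
    fn reload(&self, fresh: Keys) {
        self.0.swap(Arc::new(fresh));
    }
}

fn main() {
    let keys = SwappableKeys::new(Keys(vec!["old".into()]));
    assert_eq!(keys.current().0[0], "old");
    keys.reload(Keys(vec!["new".into()]));
    assert_eq!(keys.current().0[0], "new");
}
```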
Fixes #4897 --------- Co-authored-by: Heikki Linnakangas Co-authored-by: Joonas Koivunen --- Cargo.lock | 7 ++ Cargo.toml | 1 + libs/utils/Cargo.toml | 1 + libs/utils/src/auth.rs | 76 +++++++++++-- libs/utils/src/http/endpoint.rs | 4 +- pageserver/src/bin/pageserver.rs | 18 ++-- pageserver/src/config.rs | 2 +- pageserver/src/http/openapi_spec.yml | 25 +++++ pageserver/src/http/routes.rs | 38 ++++++- pageserver/src/page_service.rs | 10 +- safekeeper/src/bin/safekeeper.rs | 9 +- safekeeper/src/http/routes.rs | 9 +- safekeeper/src/lib.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 27 ++++- test_runner/fixtures/pageserver/http.py | 4 + test_runner/regress/test_auth.py | 137 ++++++++++++++++++++---- 16 files changed, 313 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a7e88688ce..10f13e21fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -170,6 +170,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "arc-swap" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" + [[package]] name = "archery" version = "0.5.0" @@ -5951,6 +5957,7 @@ name = "utils" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "async-trait", "bincode", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index d992db4858..6be831a2b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] anyhow = { version = "1.0", features = ["backtrace"] } +arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } azure_core = "0.16" azure_identity = "0.16" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 3f4ef2abeb..ccf6f4f2d7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +arc-swap.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 37299e4e7f..6a26f17115 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,8 @@ // For details about authentication see docs/authentication.md +use arc_swap::ArcSwap; use serde; -use std::fs; +use std::{fs, sync::Arc}; use anyhow::Result; use camino::Utf8Path; @@ -44,31 +45,88 @@ impl Claims { } } +pub struct SwappableJwtAuth(ArcSwap); + +impl SwappableJwtAuth { + pub fn new(jwt_auth: JwtAuth) -> Self { + SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth))) + } + pub fn swap(&self, jwt_auth: JwtAuth) { + self.0.swap(Arc::new(jwt_auth)); + } + pub fn decode(&self, token: &str) -> Result> { + self.0.load().decode(token) + } +} + +impl std::fmt::Debug for SwappableJwtAuth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Swappable({:?})", self.0.load()) + } +} + pub struct JwtAuth { - decoding_key: DecodingKey, + decoding_keys: Vec, validation: Validation, } impl JwtAuth { - pub fn new(decoding_key: DecodingKey) -> Self { + pub fn new(decoding_keys: Vec) -> Self { let mut validation = Validation::default(); validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. 
validation.required_spec_claims = [].into(); Self { - decoding_key, + decoding_keys, validation, } } pub fn from_key_path(key_path: &Utf8Path) -> Result { - let public_key = fs::read(key_path)?; - Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?)) + let metadata = key_path.metadata()?; + let decoding_keys = if metadata.is_dir() { + let mut keys = Vec::new(); + for entry in fs::read_dir(key_path)? { + let path = entry?.path(); + if !path.is_file() { + // Ignore directories (don't recurse) + continue; + } + let public_key = fs::read(path)?; + keys.push(DecodingKey::from_ed_pem(&public_key)?); + } + keys + } else if metadata.is_file() { + let public_key = fs::read(key_path)?; + vec![DecodingKey::from_ed_pem(&public_key)?] + } else { + anyhow::bail!("path is neither a directory or a file") + }; + if decoding_keys.is_empty() { + anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected."); + } + Ok(Self::new(decoding_keys)) } + /// Attempt to decode the token with the internal decoding keys. + /// + /// The function tries the stored decoding keys in succession, + /// and returns the first yielding a successful result. + /// If there is no working decoding key, it returns the last error. pub fn decode(&self, token: &str) -> Result> { - Ok(decode(token, &self.decoding_key, &self.validation)?) + let mut res = None; + for decoding_key in &self.decoding_keys { + res = Some(decode(token, decoding_key, &self.validation)); + if let Some(Ok(res)) = res { + return Ok(res); + } + } + if let Some(res) = res { + res.map_err(anyhow::Error::new) + } else { + anyhow::bail!("no JWT decoding keys configured") + } } } @@ -129,7 +187,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; // Check it can be validated with the public key - let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]); let claims_from_token = auth.decode(encoded_eddsa)?.claims; assert_eq!(claims_from_token, expected_claims); @@ -146,7 +204,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?; // decode it back - let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]); let decoded = auth.decode(&encoded)?; assert_eq!(decoded.claims, claims); diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a9ee2425a4..a8a635b6fd 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,4 +1,4 @@ -use crate::auth::{Claims, JwtAuth}; +use crate::auth::{Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use anyhow::Context; use hyper::header::{HeaderName, AUTHORIZATION}; @@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> { } pub fn auth_middleware( - provide_auth: fn(&Request) -> Option<&JwtAuth>, + provide_auth: fn(&Request) -> Option<&SwappableJwtAuth>, ) -> Middleware { Middleware::pre(move |req| async move { if let Some(auth) = provide_auth(&req) { diff --git 
a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 93ae43a14b..5b0c140d00 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -34,8 +34,11 @@ use postgres_backend::AuthType; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, + auth::{JwtAuth, SwappableJwtAuth}, + logging, project_build_tag, project_git_version, + sentry_init::init_sentry, + signals::Signal, + tcp_listener, }; project_git_version!(GIT_VERSION); @@ -321,13 +324,12 @@ fn start_pageserver( let http_auth; let pg_auth; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { - // unwrap is ok because check is performed when creating config, so path is set and file exists + // unwrap is ok because check is performed when creating config, so path is set and exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); - info!( - "Loading public key for verifying JWT tokens from {:#?}", - key_path - ); - let auth: Arc = Arc::new(JwtAuth::from_key_path(key_path)?); + info!("Loading public key(s) for verifying JWT tokens from {key_path:?}"); + + let jwt_auth = JwtAuth::from_key_path(key_path)?; + let auth: Arc = Arc::new(SwappableJwtAuth::new(jwt_auth)); http_auth = match &conf.http_auth_type { AuthType::Trust => None, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a58c08e29b..05ed9c0cee 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -161,7 +161,7 @@ pub struct PageServerConf { pub http_auth_type: AuthType, /// authentication method for libpq connections from compute pub pg_auth_type: AuthType, - /// Path to a file containing public key for verifying JWT tokens. + /// Path to a file or directory containing public key(s) for verifying JWT tokens. /// Used for both mgmt and compute auth, if enabled. pub auth_validation_public_key_path: Option, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d31e4a1554..9bff0fd668 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -52,6 +52,31 @@ paths: schema: type: object + /v1/reload_auth_validation_keys: + post: + description: Reloads the JWT public keys from their pre-configured location on disk. + responses: + "200": + description: The reload completed successfully. 
+ "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error (also hits if no keys were found) + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a0643bc580..b215b3016e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::JwtAuth; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -45,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ - auth::JwtAuth, + auth::SwappableJwtAuth, generation::Generation, http::{ endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, @@ -64,7 +65,7 @@ use super::models::ConfigureFailpointsRequest; pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, - auth: Option>, + auth: Option>, allowlist_routes: Vec, remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, @@ -76,7 +77,7 @@ impl State { pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, - auth: Option>, + auth: Option>, remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, @@ -392,6 +393,32 @@ async fn status_handler( json_response(StatusCode::OK, StatusResponse { id: config.id }) } +async fn reload_auth_validation_keys_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + let config = get_config(&request); + let state = get_state(&request); + let Some(shared_auth) = &state.auth else { + return json_response(StatusCode::BAD_REQUEST, ()); + }; + // unwrap is ok because check is performed when creating config, so path is set and exists + let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); + info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); + + match JwtAuth::from_key_path(key_path) { + Ok(new_auth) => { + shared_auth.swap(new_auth); + json_response(StatusCode::OK, ()) + } + Err(e) => { + warn!("Error reloading public keys from {key_path:?}: {e:}"); + json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + } + } +} + async fn timeline_create_handler( mut request: Request, _cancel: CancellationToken, @@ -1692,7 +1719,7 @@ where pub fn make_router( state: Arc, launch_ts: &'static LaunchTimestamp, - auth: Option>, + auth: Option>, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1721,6 +1748,9 @@ pub fn make_router( .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) }) + .post("/v1/reload_auth_validation_keys", |r| { + api_handler(r, reload_auth_validation_keys_handler) + }) .get("/v1/tenant", |r| api_handler(r, 
tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6086d0b063..be9782847b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -39,7 +39,7 @@ use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::{ - auth::{Claims, JwtAuth, Scope}, + auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, lsn::Lsn, simple_rcu::RcuReadGuard, @@ -121,7 +121,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() pub async fn libpq_listener_main( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, listener: TcpListener, auth_type: AuthType, listener_ctx: RequestContext, @@ -189,7 +189,7 @@ pub async fn libpq_listener_main( async fn page_service_conn_main( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, @@ -252,7 +252,7 @@ async fn page_service_conn_main( struct PageServerHandler { _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, claims: Option, /// The context created for the lifetime of the connection @@ -266,7 +266,7 @@ impl PageServerHandler { pub fn new( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d476cf33ae..2e54380471 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME}; use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope}; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; use utils::{ id::NodeId, logging::{self, LogFormat}, @@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> { None } Some(path) => { - info!("loading http auth JWT key from {path}"); - Some(Arc::new( - JwtAuth::from_key_path(path).context("failed to load the auth key")?, - )) + info!("loading http auth JWT key(s) from {path}"); + let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?; + Some(Arc::new(SwappableJwtAuth::new(jwt_auth))) } }; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 06b903719e..c48b5330b3 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; use utils::{ - auth::JwtAuth, + auth::SwappableJwtAuth, http::{ endpoint::{self, auth_middleware, check_permission_with}, error::ApiError, @@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder if ALLOWLIST_ROUTES.contains(request.uri()) { None } else { - // Option> is always provided as data below, hence unwrap(). - request.data::>>().unwrap().as_deref() + // Option> is always provided as data below, hence unwrap(). 
+ request + .data::>>() + .unwrap() + .as_deref() } })) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 344b0b14e4..3a086f1f54 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,7 +7,10 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::{ + auth::SwappableJwtAuth, + id::{NodeId, TenantId, TenantTimelineId}, +}; mod auth; pub mod broker; @@ -70,7 +73,7 @@ pub struct SafeKeeperConf { pub wal_backup_enabled: bool, pub pg_auth: Option>, pub pg_tenant_only_auth: Option>, - pub http_auth: Option>, + pub http_auth: Option>, pub current_thread_runtime: bool, } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d00bf8c4d8..4057f620a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -361,7 +361,6 @@ class PgProtocol: @dataclass class AuthKeys: - pub: str priv: str def generate_token(self, *, scope: str, **token_data: str) -> str: @@ -877,9 +876,31 @@ class NeonEnv: @cached_property def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text() priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text() - return AuthKeys(pub=pub, priv=priv) + return AuthKeys(priv=priv) + + def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path): + # compare generate_auth_keys() in local_env.rs + subprocess.run( + ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path], + cwd=self.repo_dir, + check=True, + ) + + subprocess.run( + [ + "openssl", + "pkey", + "-in", + privkey_path, + "-pubout", + "-out", + pubkey_path, + ], + cwd=self.repo_dir, + check=True, + ) + del self.auth_keys def generate_endpoint_id(self) -> str: """ diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d2c3715c52..aff7959aa7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session): assert res_json is None return res_json + def reload_auth_validation_keys(self): + res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys") + self.verbose_error(res) + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 76b75c1caf..f785dc0c8e 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,12 +1,35 @@ +import os from contextlib import closing +from pathlib import Path import psycopg2 import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol -from fixtures.pageserver.http import PageserverApiException +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgProtocol, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.types import TenantId, TimelineId +def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): + http_client.timeline_create( + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, + ) + + +def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient): + with pytest.raises( + PageserverApiException, + match="Unauthorized: malformed jwt token", + ): + 
assert_client_authorized(env, http_client) + + def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() @@ -27,30 +50,16 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=pageserver_token) # tenant can create branches - tenant_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + assert_client_authorized(env, tenant_http_client) + # console can create branches for tenant - pageserver_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + assert_client_authorized(env, pageserver_http_client) # fail to create branch using token with different tenant_id with pytest.raises( PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" ): - invalid_tenant_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + assert_client_authorized(env, invalid_tenant_http_client) # create tenant using management token pageserver_http_client.tenant_create(TenantId.generate()) @@ -82,6 +91,94 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (5000050000,) +def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + + pageserver_token_old = env.auth_keys.generate_pageserver_token() + pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) + + pageserver_http_client_old.reload_auth_validation_keys() + + # This test is to ensure that the pageserver supports multiple keys. + # The neon_local tool generates one key pair at a hardcoded path by default. + # As a preparation for our test, move the public key of the key pair into a + # directory at the same location as the hardcoded path by: + # 1. moving the the file at `configured_pub_key_path` to a temporary location + # 2. creating a new directory at `configured_pub_key_path` + # 3. 
moving the file from the temporary location into the newly created directory + configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" + os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file") + os.mkdir(configured_pub_key_path) + os.rename( + Path(env.repo_dir) / "auth_public_key.pem.file", + configured_pub_key_path / "auth_public_key_old.pem", + ) + + # Add a new key pair + # This invalidates env.auth_keys and makes them be regenerated + env.regenerate_keys_at( + Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem") + ) + + # Reload the keys on the pageserver side + pageserver_http_client_old.reload_auth_validation_keys() + + # We can continue doing things using the old token + assert_client_authorized(env, pageserver_http_client_old) + + pageserver_token_new = env.auth_keys.generate_pageserver_token() + pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new) + + # The new token also works + assert_client_authorized(env, pageserver_http_client_new) + + # Remove the old token and reload + os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem") + pageserver_http_client_old.reload_auth_validation_keys() + + # Reloading fails now with the old token, but the new token still works + assert_client_not_authorized(env, pageserver_http_client_old) + assert_client_authorized(env, pageserver_http_client_new) + + +def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + + pageserver_token_old = env.auth_keys.generate_pageserver_token() + pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) + + pageserver_http_client_old.reload_auth_validation_keys() + + # Regenerate the keys + env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem")) + + # Reload the keys on the pageserver side + pageserver_http_client_old.reload_auth_validation_keys() + + # Next attempt fails as we use the old auth token + with pytest.raises( + PageserverApiException, + match="Unauthorized: malformed jwt token", + ): + pageserver_http_client_old.reload_auth_validation_keys() + + # same goes for attempts trying to create a timeline + assert_client_not_authorized(env, pageserver_http_client_old) + + pageserver_token_new = env.auth_keys.generate_pageserver_token() + pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new) + + # timeline creation works with the new token + assert_client_authorized(env, pageserver_http_client_new) + + # reloading also works with the new token + pageserver_http_client_new.reload_auth_validation_keys() + + @pytest.mark.parametrize("auth_enabled", [False, True]) def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled From fc47af156f4d712e04722a3319a24e4c136c1f0b Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Tue, 7 Nov 2023 16:49:26 +0100 Subject: [PATCH 10/36] Passing neon options to the console (#5781) The idea is to pass neon_* prefixed options to control plane. It can be used by cplane to dynamically create timelines and computes. Such options also should be excluded from passing to compute. Another issue is how connection caching is working now, because compute's instance now depends not only on hostname but probably on such options too I included them to cache key. 
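
To make the cache-key construction concrete, here is a small self-contained sketch of the filter-and-sort step. It is an approximation, not the proxy code itself: the real implementation below works on `options_raw()`, uses itertools' `sorted`/`intersperse`, and matches parameters against a `^neon_\w+:` regex, while this sketch only checks the prefix and a colon. The expected value mirrors the `parse_neon_options` test added in this patch (project name taken from SNI, options from the startup packet).

```rust
fn neon_options(options_raw: &str) -> Option<String> {
    let mut opts: Vec<&str> = options_raw
        .split_whitespace()
        // stand-in for is_neon_param(): keep only neon_*:value parameters
        .filter(|opt| opt.starts_with("neon_") && opt.contains(':'))
        .collect();
    if opts.is_empty() {
        return None;
    }
    opts.sort(); // sorted so the concatenation is stable enough to key a cache
    Some(opts.join(" "))
}

fn main() {
    let raw = "application_name=psql neon_lsn:0/2 neon_endpoint_type:read_write";
    let cache_key = format!("{}{}", "project", neon_options(raw).unwrap_or_default());
    assert_eq!(cache_key, "projectneon_endpoint_type:read_write neon_lsn:0/2");
}
```

Because the neon_* options are folded into `cache_key`, two clients that differ only in these options no longer share the same cached wake-compute entry.
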
--- proxy/src/auth/credentials.rs | 39 +++++++++++++++++++++++++-- proxy/src/compute.rs | 9 ++++++- proxy/src/console/provider.rs | 1 + proxy/src/console/provider/neon.rs | 3 ++- proxy/src/proxy.rs | 31 ++++++++++++++++++++- proxy/src/proxy/tests.rs | 1 + proxy/src/serverless/conn_pool.rs | 21 ++++++++------- proxy/src/serverless/sql_over_http.rs | 12 +++++++++ 8 files changed, 103 insertions(+), 14 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 7dc304d7ac..d7a8edca79 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,8 @@ //! User credentials used in authentication. -use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError}; +use crate::{ + auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options, +}; use itertools::Itertools; use pq_proto::StartupMessageParams; use std::collections::HashSet; @@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option, + + pub cache_key: String, } impl ClientCredentials<'_> { @@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> { ClientCredentials { user: "", project: None, + cache_key: "".to_string(), } } @@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> { info!(user, project = project.as_deref(), "credentials"); - Ok(Self { user, project }) + let cache_key = format!( + "{}{}", + project.as_deref().unwrap_or(""), + neon_options(params).unwrap_or("".to_string()) + ); + + Ok(Self { + user, + project, + cache_key, + }) } } @@ -176,6 +191,7 @@ mod tests { let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); + assert_eq!(creds.cache_key, "foo"); Ok(()) } @@ -303,4 +319,23 @@ mod tests { _ => panic!("bad error: {err:?}"), } } + + #[test] + fn parse_neon_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["localhost".into()].into()); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("project")); + assert_eq!( + creds.cache_key, + "projectneon_endpoint_type:read_write neon_lsn:0/2" + ); + + Ok(()) + } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index e96b79ed92..53eb0e3a76 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -3,6 +3,7 @@ use crate::{ cancellation::CancelClosure, console::errors::WakeComputeError, error::{io_error, UserFacingError}, + proxy::is_neon_param, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] let options: String = params .options_raw()? 
- .filter(|opt| parse_endpoint_param(opt).is_none()) + .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt)) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -313,5 +314,11 @@ mod tests { let params = StartupMessageParams::new([("options", "project = foo")]); assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 32c3467092..c7cfc88c75 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -178,6 +178,7 @@ pub struct ConsoleReqExtra<'a> { pub session_id: uuid::Uuid, /// Name of client application, if set. pub application_name: Option<&'a str>, + pub options: Option<&'a str>, } /// Auth secret which is managed by the cloud. diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 927fea0a13..6229840c46 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -99,6 +99,7 @@ impl Api { .query(&[ ("application_name", extra.application_name), ("project", Some(project)), + ("options", extra.options), ]) .build()?; @@ -151,7 +152,7 @@ impl super::Api for Api { extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials, ) -> Result { - let key = creds.project().expect("impossible"); + let key: &str = &creds.cache_key; // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 884aae1651..54c3503c93 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,10 +15,12 @@ use crate::{ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; +use itertools::Itertools; use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec}; -use once_cell::sync::Lazy; +use once_cell::sync::{Lazy, OnceCell}; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use prometheus::{register_histogram_vec, HistogramVec}; +use regex::Regex; use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, @@ -881,9 +883,12 @@ impl Client<'_, S> { allow_self_signed_compute, } = self; + let console_options = neon_options(params); + let extra = console::ConsoleReqExtra { session_id, // aka this connection's id application_name: params.get("application_name"), + options: console_options.as_deref(), }; let mut latency_timer = LatencyTimer::new(mode.protocol_label()); @@ -945,3 +950,27 @@ impl Client<'_, S> { proxy_pass(stream, node.stream, &aux).await } } + +pub fn neon_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? + .filter(|opt| is_neon_param(opt)) + .sorted() // we sort it to use as cache key + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options. 
+ if options.is_empty() { + return None; + } + + Some(options) +} + +pub fn is_neon_param(bytes: &str) -> bool { + static RE: OnceCell = OnceCell::new(); + RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap()); + + RE.get().unwrap().is_match(bytes) +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 142c32fb84..3ae4df46ef 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -440,6 +440,7 @@ fn helper_create_connect_info( let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some("TEST"), + options: None, }; let creds = auth::BackendType::Test(mechanism); (cache, extra, creds) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c5bfc32568..d09554a922 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ auth, console, - proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER}, + proxy::{ + neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, + NUM_DB_CONNECTIONS_OPENED_COUNTER, + }, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; @@ -41,6 +44,7 @@ pub struct ConnInfo { pub dbname: String, pub hostname: String, pub password: String, + pub options: Option, } impl ConnInfo { @@ -401,26 +405,25 @@ async fn connect_to_compute( let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); - let credential_params = StartupMessageParams::new([ + let params = StartupMessageParams::new([ ("user", &conn_info.username), ("database", &conn_info.dbname), ("application_name", APP_NAME), + ("options", conn_info.options.as_deref().unwrap_or("")), ]); let creds = config .auth_backend .as_ref() - .map(|_| { - auth::ClientCredentials::parse( - &credential_params, - Some(&conn_info.hostname), - common_names, - ) - }) + .map(|_| auth::ClientCredentials::parse(¶ms, Some(&conn_info.hostname), common_names)) .transpose()?; + + let console_options = neon_options(¶ms); + let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some(APP_NAME), + options: console_options.as_deref(), }; let node_info = creds diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8f7cf7fbaf..16736ac00d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -174,11 +174,23 @@ fn get_conn_info( } } + let pairs = connection_url.query_pairs(); + + let mut options = Option::None; + + for (key, value) in pairs { + if key == "options" { + options = Some(value.to_string()); + break; + } + } + Ok(ConnInfo { username: username.to_owned(), dbname: dbname.to_owned(), hostname: hostname.to_owned(), password: password.to_owned(), + options, }) } From 11d9d801b50f9e9b9085d7dd700dccb7cda553b6 Mon Sep 17 00:00:00 2001 From: duguorong009 <80258679+duguorong009@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:57:26 -0500 Subject: [PATCH 11/36] pageserver: improve the shutdown log error (#5792) ## Problem - Close #5784 ## Summary of changes - Update the `GetActiveTenantError` -> `QueryError` conversion process in `pageserver/src/page_service.rs` - Update the pytest logging exceptions in `./test_runner/regress/test_tenant_detach.py` --- pageserver/src/page_service.rs | 4 ++++ .../regress/test_pageserver_restarts_under_workload.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) 
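
For context on why this mapping quiets the logs: `postgres_backend` (see its `log_query_error` in the JWT logging patch further down this series) reports `QueryError::Shutdown` at info level as a cancelled query, while anything that falls through to `QueryError::Other` is logged at error level, which is what previously forced the test to allow-list the message. A condensed, self-contained illustration of that dispatch (not the actual pageserver or postgres_backend types):

```rust
enum QueryError {
    Shutdown,
    Other(String),
}

// Mirrors the shape of postgres_backend's log_query_error: Shutdown is an
// expected, info-level outcome; Other is treated as a real error.
fn log_query_error(query: &str, e: &QueryError) {
    match e {
        QueryError::Shutdown => {
            println!("INFO: query handler for '{query}' cancelled during tenant shutdown")
        }
        QueryError::Other(msg) => {
            println!("ERROR: query handler for '{query}' failed: {msg}")
        }
    }
}

fn main() {
    // Before this patch, a page request racing a tenant shutdown took the
    // second path; with the Stopping case mapped to Shutdown it takes the first.
    log_query_error("pagestream", &QueryError::Shutdown);
    log_query_error("pagestream", &QueryError::Other("will not become active".into()));
}
```
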
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index be9782847b..5193b3c5ff 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use bytes::Bytes; use futures::Stream; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -1330,6 +1331,9 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), + GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { + QueryError::Shutdown + } e => QueryError::Other(anyhow::anyhow!(e)), } } diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index d07b8dbe6b..65569f3bac 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,10 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # Pageserver currently logs requests on non-active tenants at error level - # https://github.com/neondatabase/neon/issues/5784 - env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*") - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) From acef742a6e471e214dfe6e88e8e94a77b6fcd2db Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 7 Nov 2023 09:41:20 -0800 Subject: [PATCH 12/36] vm-monitor: Remove dependency on workspace_hack (#5752) neondatabase/autoscaling builds libs/vm-monitor during CI because it's a necessary component of autoscaling. workspace_hack includes a lot of crates that are not necessary for vm-monitor, which artificially inflates the build time on the autoscaling side, so hopefully removing the dependency should speed things up. Co-authored-by: Joonas Koivunen --- .config/hakari.toml | 6 ++++++ Cargo.lock | 1 - libs/vm_monitor/Cargo.toml | 3 +-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 15b939e86f..9913ecc9c0 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -22,5 +22,11 @@ platforms = [ # "x86_64-pc-windows-msvc", ] +[final-excludes] +# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but +# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded +# from depending on workspace-hack because most of the dependencies are not used. +workspace-members = ["vm_monitor"] + # Write out exact versions rather than a semver range. (Defaults to false.) 
# exact-versions = true diff --git a/Cargo.lock b/Cargo.lock index 10f13e21fd..5e7fce3e8d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6055,7 +6055,6 @@ dependencies = [ "tokio-util", "tracing", "tracing-subscriber", - "workspace_hack", ] [[package]] diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index 26b976830a..46e9f880a1 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -19,13 +19,12 @@ inotify.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["rt-multi-thread"] } tokio-postgres.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true -workspace_hack = { version = "0.1", path = "../../workspace_hack" } [target.'cfg(target_os = "linux")'.dependencies] cgroups-rs = "0.3.3" From b989ad1922a46dbb6bc21d71b805246c22519139 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 Nov 2023 11:26:56 +0000 Subject: [PATCH 13/36] extend test_change_pageserver for failure case, rework changing pageserver (#5693) Reproducer for https://github.com/neondatabase/neon/issues/5692 The test change in this PR intentionally fails, to demonstrate the issue. --------- Co-authored-by: Sasha Krassovsky --- compute_tools/src/compute.rs | 10 +- pgxn/neon/libpagestore.c | 142 ++++++++++++++++-- test_runner/regress/test_change_pageserver.py | 77 ++++++++-- 3 files changed, 200 insertions(+), 29 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9fd88e5818..373f05ab2f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -710,8 +710,12 @@ impl ComputeNode { // `pg_ctl` for start / stop, so this just seems much easier to do as we already // have opened connection to Postgres and superuser access. #[instrument(skip_all)] - fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { - client.simple_query("SELECT pg_reload_conf()")?; + fn pg_reload_conf(&self) -> Result<()> { + let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + Command::new(pgctl_bin) + .args(["reload", "-D", &self.pgdata]) + .output() + .expect("cannot run pg_ctl process"); Ok(()) } @@ -724,9 +728,9 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; + self.pg_reload_conf()?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. // Disable DDL forwarding because control plane already knows about these roles/databases. 
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 0a944a6bf0..cc09fb849d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -19,7 +19,10 @@ #include "access/xlog.h" #include "access/xlogutils.h" #include "storage/buf_internals.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" #include "c.h" +#include "postmaster/interrupt.h" #include "libpq-fe.h" #include "libpq/pqformat.h" @@ -61,23 +64,63 @@ int flush_every_n_requests = 8; int n_reconnect_attempts = 0; int max_reconnect_attempts = 60; +#define MAX_PAGESERVER_CONNSTRING_SIZE 256 + +typedef struct +{ + LWLockId lock; + pg_atomic_uint64 update_counter; + char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; +} PagestoreShmemState; + +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif +static shmem_startup_hook_type prev_shmem_startup_hook; +static PagestoreShmemState *pagestore_shared; +static uint64 pagestore_local_counter = 0; +static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; + bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; static bool pageserver_flush(void); static void pageserver_disconnect(void); - -static pqsigfunc prev_signal_handler; +static bool +CheckPageserverConnstring(char **newval, void **extra, GucSource source) +{ + return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; +} static void -pageserver_sighup_handler(SIGNAL_ARGS) +AssignPageserverConnstring(const char *newval, void *extra) { - if (prev_signal_handler) - { - prev_signal_handler(postgres_signal_arg); - } - neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring); - pageserver_disconnect(); + if(!pagestore_shared) + return; + LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); + strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); + pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); + LWLockRelease(pagestore_shared->lock); +} + +static bool +CheckConnstringUpdated() +{ + if(!pagestore_shared) + return false; + return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); +} + +static void +ReloadConnstring() +{ + if(!pagestore_shared) + return; + LWLockAcquire(pagestore_shared->lock, LW_SHARED); + strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); + pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); + LWLockRelease(pagestore_shared->lock); } static bool @@ -91,6 +134,11 @@ pageserver_connect(int elevel) Assert(!connected); + if(CheckConnstringUpdated()) + { + ReloadConnstring(); + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. 
If the NEON_AUTH_TOKEN environment @@ -110,7 +158,7 @@ pageserver_connect(int elevel) n++; } keywords[n] = "dbname"; - values[n] = page_server_connstring; + values[n] = local_pageserver_connstring; n++; keywords[n] = NULL; values[n] = NULL; @@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request) { StringInfoData req_buff; + if(CheckConnstringUpdated()) + { + pageserver_disconnect(); + ReloadConnstring(); + } + /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) { @@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request) { while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { + HandleMainLoopInterrupts(); n_reconnect_attempts += 1; pg_usleep(RECONNECT_INTERVAL_USEC); } @@ -391,7 +446,8 @@ pageserver_flush(void) return true; } -page_server_api api = { +page_server_api api = +{ .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive @@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } +static Size +PagestoreShmemSize(void) +{ + return sizeof(PagestoreShmemState); +} + +static bool +PagestoreShmemInit(void) +{ + bool found; + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + pagestore_shared = ShmemInitStruct("libpagestore shared state", + PagestoreShmemSize(), + &found); + if(!found) + { + pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); + pg_atomic_init_u64(&pagestore_shared->update_counter, 0); + AssignPageserverConnstring(page_server_connstring, NULL); + } + LWLockRelease(AddinShmemInitLock); + return found; +} + +static void +pagestore_shmem_startup_hook(void) +{ + if(prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + PagestoreShmemInit(); +} + +static void +pagestore_shmem_request(void) +{ +#if PG_VERSION_NUM >= 150000 + if(prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + RequestAddinShmemSpace(PagestoreShmemSize()); + RequestNamedLWLockTranche("neon_libpagestore", 1); +} + +static void +pagestore_prepare_shmem(void) +{ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pagestore_shmem_request; +#else + pagestore_shmem_request(); +#endif + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pagestore_shmem_startup_hook; +} + /* * Module initialization function */ void pg_init_libpagestore(void) { + pagestore_prepare_shmem(); + DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, @@ -418,7 +534,7 @@ pg_init_libpagestore(void) "", PGC_SIGHUP, 0, /* no flags required */ - NULL, NULL, NULL); + CheckPageserverConnstring, AssignPageserverConnstring, NULL); DefineCustomStringVariable("neon.timeline_id", "Neon timeline_id the server is running on", @@ -499,7 +615,5 @@ pg_init_libpagestore(void) redo_read_buffer_filter = neon_redo_read_buffer_filter; } - prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler); - lfc_init(); } diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 31092b70b9..410bf03c2b 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -1,9 +1,13 @@ +import asyncio + from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind def test_change_pageserver(neon_env_builder: 
NeonEnvBuilder): + num_connections = 3 + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, @@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) - pg_conn = endpoint.connect() - cur = pg_conn.cursor() + pg_conns = [endpoint.connect() for i in range(num_connections)] + curs = [pg_conn.cursor() for pg_conn in pg_conns] + + def execute(statement: str): + for cur in curs: + cur.execute(statement) + + def fetchone(): + results = [cur.fetchone() for cur in curs] + assert all(result == results[0] for result in results) + return results[0] # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - cur.execute("CREATE TABLE foo (t text)") - cur.execute( + curs[0].execute("CREATE TABLE foo (t text)") + curs[0].execute( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + curs[0].execute( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) - row = cur.fetchone() + row = curs[0].fetchone() assert row is not None log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000,) + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) endpoint.reconfigure(pageserver_id=alt_pageserver_id) # Verify that the neon.pageserver_connstring GUC is set to the correct thing - cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") - connstring = cur.fetchone() + execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = fetchone() assert connstring is not None expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}" assert expected_connstring == expected_connstring @@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one - cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000,) + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) + + # Try failing back, and this time we will stop the current pageserver before reconfiguring + # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this + # is more like what happens in an unexpected pageserver failure. + env.pageservers[0].start() + env.pageservers[1].stop() + + endpoint.reconfigure(pageserver_id=env.pageservers[0].id) + + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) + + env.pageservers[0].stop() + env.pageservers[1].start() + + # Test a (former) bug where a child process spins without updating its connection string + # by executing a query separately. This query will hang until we issue the reconfigure. 
+ async def reconfigure_async(): + await asyncio.sleep( + 1 + ) # Sleep for 1 second just to make sure we actually started our count(*) query + endpoint.reconfigure(pageserver_id=env.pageservers[1].id) + + def execute_count(): + execute("SELECT count(*) FROM FOO") + + async def execute_and_reconfigure(): + task_exec = asyncio.to_thread(execute_count) + task_reconfig = asyncio.create_task(reconfigure_async()) + await asyncio.gather( + task_exec, + task_reconfig, + ) + + asyncio.run(execute_and_reconfigure()) + assert fetchone() == (100000,) + + # One final check that nothing hangs + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) From a8a39cd4647f2b4123ca6a4d47ce8042bd3e1ce5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 Nov 2023 12:41:48 +0000 Subject: [PATCH 14/36] test: de-flake test_deletion_queue_recovery (#5822) ## Problem This test could fail if timing is unlucky, and the deletions in the test land in two deletion lists instead of one. ## Summary of changes We await _some_ validations instead of _all_ validations, because our execution failpoint will prevent validation proceeding for any but the first DeletionList. Usually the workload just generates one, but if it generates two due to timing, then we must not expect that the second one will be validated. --- .../regress/test_pageserver_generations.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 3e5021ae06..c3f4ad476f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -366,11 +366,17 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 if validate_before == ValidateBefore.VALIDATE: + # At this point, one or more DeletionLists have been written. We have set a failpoint + # to prevent them successfully executing, but we want to see them get validated. + # + # We await _some_ validations instead of _all_ validations, because our execution failpoint + # will prevent validation proceeding for any but the first DeletionList. Usually the workload + # just generates one, but if it generates two due to timing, then we must not expect that the + # second one will be validated. + def assert_some_validations(): + assert get_deletion_queue_validated(ps_http) > 0 - def assert_validation_complete(): - assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http) - - wait_until(20, 1, assert_validation_complete) + wait_until(20, 1, assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -380,6 +386,11 @@ def test_deletion_queue_recovery( wait_until(20, 1, assert_header_written) + # If we will lose attachment, then our expectation on restart is that only the ones + # we already validated will execute. Act like only those were present in the queue. 
+ if keep_attachment == KeepAttachment.LOSE: + before_restart_depth = get_deletion_queue_validated(ps_http) + log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") env.pageserver.stop(immediate=True) @@ -402,11 +413,13 @@ def test_deletion_queue_recovery( ps_http.deletion_queue_flush(execute=True) wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) - if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE: + if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute # because on re-attach they were from the immediately preceding generation - # - If we validated before restart, then the deletions should execute because the - # deletion queue header records a validated deletion list sequence number. + assert get_deletion_queue_executed(ps_http) == before_restart_depth + elif validate_before == ValidateBefore.VALIDATE: + # - If we validated before restart, then we should execute however many keys were + # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) From 40441f8ada8d4daf7ec9839224090931035f2589 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 Nov 2023 13:00:11 +0000 Subject: [PATCH 15/36] pageserver: use `Gate` for stronger safety check in `SlotGuard` (#5793) ## Problem #5711 and #5367 raced -- the `SlotGuard` type needs `Gate` to properly enforce its invariant that we may not drop an `Arc` from a slot. ## Summary of changes Replace the TODO with the intended check of Gate. --- libs/utils/src/sync/gate.rs | 7 +++++++ pageserver/src/tenant/mgr.rs | 9 +-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 1391d238e6..9aad0af22d 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -85,6 +85,13 @@ impl Gate { warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await } + /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This + /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking + /// the CancellationToken on such types is analogous to "Did shutdown start?" + pub fn close_complete(&self) -> bool { + self.sem.is_closed() + } + async fn do_close(&self) { tracing::debug!(gate = self.name, "Closing Gate..."); match self.sem.acquire_many(Self::MAX_UNITS).await { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 27f9d50c54..07d1618272 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1552,14 +1552,7 @@ impl SlotGuard { /// is responsible for protecting fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { - Some(TenantSlot::Attached(tenant)) => { - // TODO: PR #5711 will add a gate that enables properly checking that - // shutdown completed. - matches!( - tenant.current_state(), - TenantState::Stopping { .. } | TenantState::Broken { .. 
} - ) - } + Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), Some(TenantSlot::Secondary) => { // TODO: when adding secondary mode tenants, this will check for shutdown // in the same way that we do for `Tenant` above From e9b227a11e4d1f5df09cca9d72e32c95f02750e3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 8 Nov 2023 17:54:33 +0100 Subject: [PATCH 16/36] cleanup unused RemoteStorage fields (#5830) Found this while working on #5771 --- compute_tools/src/extension_server.rs | 4 +-- libs/remote_storage/src/lib.rs | 36 ++------------------ libs/remote_storage/tests/test_real_azure.rs | 4 +-- libs/remote_storage/tests/test_real_s3.rs | 4 +-- pageserver/src/config.rs | 8 ----- pageserver/src/deletion_queue.rs | 8 ----- pageserver/src/tenant.rs | 4 --- 7 files changed, 5 insertions(+), 63 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 3d7ed8c360..2278509c1f 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -78,7 +78,7 @@ use regex::Regex; use remote_storage::*; use serde_json; use std::io::Read; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::path::Path; use std::str; use tar::Archive; @@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result @@ -443,10 +431,6 @@ pub struct StorageMetadata(HashMap); /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { - /// Max allowed number of concurrent sync operations between the API user and the remote storage. - pub max_concurrent_syncs: NonZeroUsize, - /// Max allowed errors before the sync task is considered failed and evicted. - pub max_sync_errors: NonZeroU32, /// The storage connection configuration. pub storage: RemoteStorageKind, } @@ -542,18 +526,6 @@ impl RemoteStorageConfig { let use_azure = container_name.is_some() && container_region.is_some(); - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? 
- .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - let default_concurrency_limit = if use_azure { DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT } else { @@ -635,11 +607,7 @@ impl RemoteStorageConfig { } }; - Ok(Some(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - })) + Ok(Some(RemoteStorageConfig { storage })) } } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 0338270aaf..59b92b48fb 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::path::PathBuf; use std::sync::Arc; @@ -469,8 +469,6 @@ fn create_azure_client( let random = rand::thread_rng().gen::(); let remote_storage_config = RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), - max_sync_errors: NonZeroU32::new(5).unwrap(), storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, container_region: remote_storage_azure_region, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 7e2aa9f6d7..e2360b7533 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::path::PathBuf; use std::sync::Arc; @@ -396,8 +396,6 @@ fn create_s3_client( let random = rand::thread_rng().gen::(); let remote_storage_config = RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), - max_sync_errors: NonZeroU32::new(5).unwrap(), storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: remote_storage_s3_bucket, bucket_region: remote_storage_s3_region, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 05ed9c0cee..87d9cc522e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS - ) - .unwrap(), - max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) - .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" @@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), bucket_region: bucket_region.clone(), diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index b6f889c682..6a732d1029 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -893,14 +893,6 @@ mod test { std::fs::create_dir_all(remote_fs_dir)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), 
- max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a738633d5e..dc07ea7346 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3528,10 +3528,6 @@ pub(crate) mod harness { let remote_fs_dir = conf.workdir.join("localfs"); std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { - // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); From ea118a238a31187248b8ca44bb77849e63bd3fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 8 Nov 2023 17:56:53 +0100 Subject: [PATCH 17/36] JWT logging improvements (#5823) * lower level on auth success from info to debug (fixes #5820) * don't log stacktraces on auth errors (as requested on slack). we do this by introducing an `AuthError` type instead of using `anyhow` and `bail`. * return errors that have been censored for improved security. --- libs/postgres_backend/src/lib.rs | 12 ++++++-- libs/utils/src/auth.rs | 52 ++++++++++++++++++++------------ libs/utils/src/http/endpoint.rs | 17 ++++++----- libs/utils/src/http/error.rs | 5 ++- pageserver/src/auth.rs | 19 ++++++------ pageserver/src/http/routes.rs | 2 ++ pageserver/src/page_service.rs | 17 ++++++----- safekeeper/src/auth.rs | 17 ++++++----- safekeeper/src/handler.rs | 27 +++++++++-------- test_runner/regress/test_auth.py | 12 ++++---- 10 files changed, 105 insertions(+), 75 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 1e25c6ed6a..92a0ec7c73 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -17,7 +17,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; -use tracing::{debug, error, info, trace}; +use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ @@ -35,6 +35,9 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Authentication failure + #[error("Unauthorized: {0}")] + Unauthorized(std::borrow::Cow<'static, str>), /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), @@ -51,6 +54,7 @@ impl QueryError { match self { Self::Disconnected(_) => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, + Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -610,7 +614,7 @@ impl PostgresBackend { if let Err(e) = handler.check_auth_jwt(self, jwt_response) { self.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), + &short_error(&e), Some(e.pg_error_code()), ))?; return Err(e); @@ -966,6 +970,7 @@ pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), 
QueryError::Shutdown => "shutdown".to_string(), + QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::Other(e) => format!("{e:#}"), } } @@ -985,6 +990,9 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::Unauthorized(e) => { + warn!("query handler for '{query}' failed with authentication error: {e}"); + } QueryError::Other(e) => { error!("query handler for '{query}' failed: {e:?}"); } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 6a26f17115..66b1f6e866 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -2,7 +2,7 @@ use arc_swap::ArcSwap; use serde; -use std::{fs, sync::Arc}; +use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; use camino::Utf8Path; @@ -11,7 +11,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::id::TenantId; +use crate::{http::error::ApiError, id::TenantId}; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -54,7 +54,7 @@ impl SwappableJwtAuth { pub fn swap(&self, jwt_auth: JwtAuth) { self.0.swap(Arc::new(jwt_auth)); } - pub fn decode(&self, token: &str) -> Result> { + pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { self.0.load().decode(token) } } @@ -65,6 +65,24 @@ impl std::fmt::Debug for SwappableJwtAuth { } } +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub struct AuthError(pub Cow<'static, str>); + +impl Display for AuthError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. + ApiError::Forbidden("JWT authentication error".to_string()) + } +} + pub struct JwtAuth { decoding_keys: Vec, validation: Validation, @@ -114,7 +132,7 @@ impl JwtAuth { /// The function tries the stored decoding keys in succession, /// and returns the first yielding a successful result. /// If there is no working decoding key, it returns the last error. 
- pub fn decode(&self, token: &str) -> Result> { + pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { let mut res = None; for decoding_key in &self.decoding_keys { res = Some(decode(token, decoding_key, &self.validation)); @@ -123,9 +141,9 @@ impl JwtAuth { } } if let Some(res) = res { - res.map_err(anyhow::Error::new) + res.map_err(|e| AuthError(Cow::Owned(e.to_string()))) } else { - anyhow::bail!("no JWT decoding keys configured") + Err(AuthError(Cow::Borrowed("no JWT decoding keys configured"))) } } } @@ -166,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH "#; #[test] - fn test_decode() -> Result<(), anyhow::Error> { + fn test_decode() { let expected_claims = Claims { - tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, }; @@ -187,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; // Check it can be validated with the public key - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]); - let claims_from_token = auth.decode(encoded_eddsa)?.claims; + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); - - Ok(()) } #[test] - fn test_encode() -> Result<(), anyhow::Error> { + fn test_encode() { let claims = Claims { - tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, }; - let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?; + let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); // decode it back - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]); - let decoded = auth.decode(&encoded)?; + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let decoded = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); - - Ok(()) } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a8a635b6fd..550ab10700 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,4 +1,4 @@ -use crate::auth::{Claims, SwappableJwtAuth}; +use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use anyhow::Context; use hyper::header::{HeaderName, AUTHORIZATION}; @@ -400,9 +400,11 @@ pub fn auth_middleware( })?; let token = parse_token(header_value)?; - let data = auth - .decode(token) - .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?; + let data = auth.decode(token).map_err(|err| { + warn!("Authentication error: {err}"); + // Rely on From for ApiError impl + err + })?; req.set_context(data.claims); } None => { @@ -450,12 +452,11 @@ where pub fn check_permission_with( req: &Request, - check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>, + check_permission: impl Fn(&Claims) -> Result<(), AuthError>, ) -> Result<(), 
ApiError> { match req.context::() { - Some(claims) => { - Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?) - } + Some(claims) => Ok(check_permission(&claims) + .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?), None => Ok(()), // claims is None because auth is disabled } } diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 7233d3a662..ac68b04888 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::error::Error as StdError; use thiserror::Error; -use tracing::{error, info}; +use tracing::{error, info, warn}; #[derive(Debug, Error)] pub enum ApiError { @@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors match api_error { + ApiError::Forbidden(_) | ApiError::Unauthorized(_) => { + warn!("Error processing HTTP request: {api_error:#}") + } ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 268117cae2..2cb661863d 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -1,22 +1,21 @@ -use anyhow::{bail, Result}; -use utils::auth::{Claims, Scope}; +use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; -pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { - (Scope::Tenant, None) => { - bail!("Attempt to access management api with tenant scope. Permission denied") - } + (Scope::Tenant, None) => Err(AuthError( + "Attempt to access management api with tenant scope. Permission denied".into(), + )), (Scope::Tenant, Some(tenant_id)) => { if claims.tenant_id.unwrap() != tenant_id { - bail!("Tenant id mismatch. Permission denied") + return Err(AuthError("Tenant id mismatch. Permission denied".into())); } Ok(()) } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => { - bail!("SafekeeperData scope makes no sense for Pageserver") - } + (Scope::SafekeeperData, _) => Err(AuthError( + "SafekeeperData scope makes no sense for Pageserver".into(), + )), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b215b3016e..7a8f37f923 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1671,6 +1671,8 @@ where ); match handle.await { + // TODO: never actually return Err from here, always Ok(...) so that we can log + // spanned errors. Call api_error_handler instead and return appropriate Body. Ok(result) => result, Err(e) => { // The handler task panicked. 
We have a global panic handler that logs the diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5193b3c5ff..5487a9d7c1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -898,7 +898,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -910,7 +910,7 @@ impl PageServerHandler { .claims .as_ref() .expect("claims presence already checked"); - check_permission(claims, tenant_id) + check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } /// Shorthand for getting a reference to a Timeline of an Active tenant. @@ -949,16 +949,17 @@ where .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) + .map_err(|e| QueryError::Unauthorized(e.0))?; if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { - return Err(QueryError::Other(anyhow::anyhow!( - "jwt token scope is Tenant, but tenant id is missing" - ))); + return Err(QueryError::Unauthorized( + "jwt token scope is Tenant, but tenant id is missing".into(), + )); } - info!( - "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", + debug!( + "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index 2684d82365..bf4905aaa7 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -1,19 +1,20 @@ -use anyhow::{bail, Result}; -use utils::auth::{Claims, Scope}; +use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; -pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { - (Scope::Tenant, None) => { - bail!("Attempt to access management api with tenant scope. Permission denied") - } + (Scope::Tenant, None) => Err(AuthError( + "Attempt to access management api with tenant scope. Permission denied".into(), + )), (Scope::Tenant, Some(tenant_id)) => { if claims.tenant_id.unwrap() != tenant_id { - bail!("Tenant id mismatch. Permission denied") + return Err(AuthError("Tenant id mismatch. 
Permission denied".into())); } Ok(()) } - (Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"), + (Scope::PageServerApi, _) => Err(AuthError( + "PageServerApi scope makes no sense for Safekeeper".into(), + )), (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 6d0ed8650d..d5333abae6 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -6,7 +6,7 @@ use std::str::FromStr; use std::str::{self}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, info_span, Instrument}; +use tracing::{debug, info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; @@ -165,26 +165,27 @@ impl postgres_backend::Handler .auth .as_ref() .expect("auth_type is configured but .auth of handler is missing"); - let data = - auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; + let data = auth + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) + .map_err(|e| QueryError::Unauthorized(e.0))?; // The handler might be configured to allow only tenant scope tokens. if matches!(allowed_auth_scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) { - return Err(QueryError::Other(anyhow::anyhow!( - "passed JWT token is for full access, but only tenant scope is allowed" - ))); + return Err(QueryError::Unauthorized( + "passed JWT token is for full access, but only tenant scope is allowed".into(), + )); } if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { - return Err(QueryError::Other(anyhow::anyhow!( - "jwt token scope is Tenant, but tenant id is missing" - ))); + return Err(QueryError::Unauthorized( + "jwt token scope is Tenant, but tenant id is missing".into(), + )); } - info!( - "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", + debug!( + "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -263,7 +264,7 @@ impl SafekeeperPostgresHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -275,7 +276,7 @@ impl SafekeeperPostgresHandler { .claims .as_ref() .expect("claims presence already checked"); - check_permission(claims, tenant_id) + check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } async fn handle_timeline_status( diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index f785dc0c8e..f729bdee98 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -25,7 +25,7 @@ def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient): with pytest.raises( PageserverApiException, - match="Unauthorized: malformed jwt token", + match="Forbidden: JWT authentication error", ): assert_client_authorized(env, http_client) @@ -56,9 +56,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): assert_client_authorized(env, pageserver_http_client) # fail to create branch using token with different tenant_id - with pytest.raises( - 
PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" - ): + with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"): assert_client_authorized(env, invalid_tenant_http_client) # create tenant using management token @@ -67,7 +65,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # fail to create tenant using tenant token with pytest.raises( PageserverApiException, - match="Forbidden: Attempt to access management api with tenant scope. Permission denied", + match="Forbidden: JWT authentication error", ): tenant_http_client.tenant_create(TenantId.generate()) @@ -94,6 +92,7 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") pageserver_token_old = env.auth_keys.generate_pageserver_token() @@ -146,6 +145,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") pageserver_token_old = env.auth_keys.generate_pageserver_token() @@ -162,7 +162,7 @@ def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): # Next attempt fails as we use the old auth token with pytest.raises( PageserverApiException, - match="Unauthorized: malformed jwt token", + match="Forbidden: JWT authentication error", ): pageserver_http_client_old.reload_auth_validation_keys() From 87389bc933e870cb10d0f88aabce80333d1792a6 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 8 Nov 2023 11:48:57 -0800 Subject: [PATCH 18/36] Add test simulating bad connection between pageserver and compute (#5728) ## Problem We have a funny 3-day timeout for connections between the compute and pageserver. We want to get rid of it, so to do that we need to make sure the compute is resilient to connection failures. Closes: https://github.com/neondatabase/neon/issues/5518 ## Summary of changes This test makes the pageserver randomly drop the connection if the failpoint is enabled, and ensures we can keep querying the pageserver. This PR also reduces the default timeout to 10 minutes from 3 days. 
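For intuition, the simulated drops boil down to replacing the fixed socket read timeout with a randomly drawn one. A minimal standalone sketch of that idea (not the exact patch code — the real failpoint hook is in the diff below; this assumes the `rand` crate and a hypothetical `simulated_timeout_ms` helper):

```rust
use rand::Rng;

/// Draw a socket timeout (in ms) from an exponential distribution with the given
/// mean, via inverse-transform sampling: if U ~ Uniform(0,1), then
/// -ln(1 - U) * mean is exponentially distributed with that mean.
fn simulated_timeout_ms(mean_ms: f64) -> u64 {
    let u: f64 = rand::thread_rng().gen_range(0.0..1.0);
    (-(1.0 - u).ln() * mean_ms) as u64
}

fn main() {
    // With a mean of ~15ms, most simulated connections are torn down almost
    // immediately, forcing the compute to exercise its reconnect path.
    for _ in 0..5 {
        println!("simulated timeout: {} ms", simulated_timeout_ms(15.0));
    }
}
```

With a small mean, the timeout fires constantly during the test workload, while the new 10-minute default stays comfortably out of the way in production.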
--- libs/postgres_backend/src/lib.rs | 11 +++- pageserver/src/page_service.rs | 30 +++++++++-- test_runner/regress/test_bad_connection.py | 60 ++++++++++++++++++++++ 3 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 test_runner/regress/test_bad_connection.py diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 92a0ec7c73..1dae008a4f 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -38,6 +38,8 @@ pub enum QueryError { /// Authentication failure #[error("Unauthorized: {0}")] Unauthorized(std::borrow::Cow<'static, str>), + #[error("Simulated Connection Error")] + SimulatedConnectionError, /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), @@ -52,7 +54,7 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error @@ -736,6 +738,9 @@ impl PostgresBackend { if let Err(e) = handler.process_query(self, query_string).await { match e { QueryError::Shutdown => return Ok(ProcessMsgResult::Break), + QueryError::SimulatedConnectionError => { + return Err(QueryError::SimulatedConnectionError) + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); @@ -971,6 +976,7 @@ pub fn short_error(e: &QueryError) -> String { QueryError::Disconnected(connection_error) => connection_error.to_string(), QueryError::Shutdown => "shutdown".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), + QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), } } @@ -987,6 +993,9 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::Disconnected(other_connection_error) => { error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") } + QueryError::SimulatedConnectionError => { + error!("query handler for query '{query}' failed due to a simulated connection error") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5487a9d7c1..2201d6c86b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -218,9 +218,27 @@ async fn page_service_conn_main( // no write timeout is used, because the kernel is assumed to error writes after some time. 
let mut socket = tokio_io_timeout::TimeoutReader::new(socket); - // timeout should be lower, but trying out multiple days for - // - socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); + let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default + let socket_timeout_ms = (|| { + fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| { + // Exponential distribution for simulating + // poor network conditions, expect about avg_timeout_ms to be around 15 + // in tests + if let Some(avg_timeout_ms) = avg_timeout_ms { + let avg = avg_timeout_ms.parse::().unwrap() as f32; + let u = rand::random::(); + ((1.0 - u).ln() / (-avg)) as u64 + } else { + default_timeout_ms + } + }); + default_timeout_ms + })(); + + // A timeout here does not mean the client died, it can happen if it's just idle for + // a while: we will tear down this PageServerHandler and instantiate a new one if/when + // they reconnect. + socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); // XXX: pgbackend.run() should take the connection_ctx, @@ -981,9 +999,13 @@ where pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + fail::fail_point!("simulated-bad-compute-connection", |_| { + info!("Hit failpoint for bad connection"); + Err(QueryError::SimulatedConnectionError) + }); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py new file mode 100644 index 0000000000..ba0624c730 --- /dev/null +++ b/test_runner/regress/test_bad_connection.py @@ -0,0 +1,60 @@ +import random +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*simulated connection error.*") + + pageserver_http = env.pageserver.http_client() + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") + endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") + + # Enable failpoint after starting everything else up so that loading initial + # basebackup doesn't fail + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. 
+ cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + cur.execute("SELECT count(*) FROM foo") + assert cur.fetchone() == (100000,) + + end_time = time.time() + 30 + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") From 04957985918df76aa41328abd3826ff5ad6e5d31 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 9 Nov 2023 13:05:17 +0000 Subject: [PATCH 19/36] Fix walproposer build on aarch64 (#5827) There was a compilation error due to `std::ffi::c_char` being different type on different platforms. Clippy also complained due to a similar reason. --- libs/walproposer/src/api_bindings.rs | 2 ++ libs/walproposer/src/walproposer.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index b0ac2a879e..7f1bbc3b80 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -188,6 +188,7 @@ extern "C" fn recovery_download( } } +#[allow(clippy::unnecessary_cast)] extern "C" fn wal_read( sk: *mut Safekeeper, buf: *mut ::std::os::raw::c_char, @@ -421,6 +422,7 @@ impl std::fmt::Display for Level { } /// Take ownership of `Vec` from StringInfoData. +#[allow(clippy::unnecessary_cast)] pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option> { if pg.data.is_null() { return None; diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 4be15344c5..0661d3a969 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -186,7 +186,7 @@ impl Wrapper { .unwrap() .into_bytes_with_nul(); assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity()); - let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8; + let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char; let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void; From 9c30883c4baf629e9adead0457322189e6833001 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Nov 2023 13:50:13 +0000 Subject: [PATCH 20/36] remote_storage: use S3 SDK's adaptive retry policy (#5813) ## Problem Currently, we aren't doing any explicit slowdown in response to 429 responses. Recently, as we hit remote storage a bit harder (pageserver does more ListObjectsv2 requests than it used to since #5580 ), we're seeing storms of 429 responses that may be the result of not just doing too may requests, but continuing to do those extra requests without backing off any more than our usual backoff::exponential. ## Summary of changes Switch from AWS's "Standard" retry policy to "Adaptive" -- docs describe this as experimental but it has been around for a long time. 
The main difference between Standard and Adaptive is that Adaptive rate-limits the client in response to feedback from the server, which is meant to avoid scenarios where the client would otherwise repeatedly hit throttling responses. --- Cargo.lock | 1 + Cargo.toml | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/s3_bucket.rs | 27 ++++++++++++++++++++++----- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e7fce3e8d..0a434c6ee6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4064,6 +4064,7 @@ dependencies = [ "aws-config", "aws-credential-types", "aws-sdk-s3", + "aws-smithy-async", "aws-smithy-http", "aws-types", "azure_core", diff --git a/Cargo.toml b/Cargo.toml index 6be831a2b3..5b719d776b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ async-trait = "0.1" aws-config = { version = "0.56", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.29" aws-smithy-http = "0.56" +aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] } aws-credential-types = "0.56" aws-types = "0.56" axum = { version = "0.6.20", features = ["ws"] } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 65b5392389..d7bcce28cb 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true anyhow.workspace = true async-trait.workspace = true once_cell.workspace = true +aws-smithy-async.workspace = true aws-smithy-http.workspace = true aws-types.workspace = true aws-config.workspace = true diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 560a2c14e9..ab3fd3fe62 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,23 +4,27 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::borrow::Cow; +use std::{borrow::Cow, sync::Arc}; use anyhow::Context; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, - provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::CredentialsProviderChain, + provider_config::ProviderConfig, + retry::{RetryConfigBuilder, RetryMode}, + web_identity_token::WebIdentityTokenCredentialsProvider, }; use aws_credential_types::cache::CredentialsCache; use aws_sdk_s3::{ - config::{Config, Region}, + config::{AsyncSleep, Config, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, primitives::ByteStream, types::{Delete, ObjectIdentifier}, Client, }; +use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_http::body::SdkBody; use hyper::Body; use scopeguard::ScopeGuard; @@ -83,10 +87,23 @@ impl S3Bucket { .or_else("imds", ImdsCredentialsProvider::builder().build()) }; + // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling + // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one + // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. 
+ let mut retry_config = RetryConfigBuilder::new(); + retry_config + .set_max_attempts(Some(1)) + .set_mode(Some(RetryMode::Adaptive)); + let mut config_builder = Config::builder() .region(region) .credentials_cache(CredentialsCache::lazy()) - .credentials_provider(credentials_provider); + .credentials_provider(credentials_provider) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) + .retry_config(retry_config.build()); if let Some(custom_endpoint) = aws_config.endpoint.clone() { config_builder = config_builder From 7cdde285a5126491a169b7eaef6dd9e222062019 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 9 Nov 2023 14:14:30 +0000 Subject: [PATCH 21/36] proxy: limit concurrent wake_compute requests per endpoint (#5799) ## Problem A user can perform many database connections at the same instant of time - these will all cache miss and materialise as requests to the control plane. #5705 ## Summary of changes I am using a `DashMap` (a sharded `RwLock`) of endpoints -> semaphores to apply a limiter. If the limiter is enabled (permits > 0), the semaphore will be retrieved per endpoint and a permit will be awaited before continuing to call the wake_compute endpoint. ### Important details This dashmap would grow uncontrollably without maintenance. It's not a cache so I don't think an LRU-based reclamation makes sense. Instead, I've made use of the sharding functionality of DashMap to lock a single shard and clear out unused semaphores periodically. I ran a test in release, using 128 tokio tasks among 12 threads each pushing 1000 entries into the map per second, clearing a shard every 2 seconds (64 second epoch with 32 shards). The endpoint names were sampled from a gamma distribution to make sure some overlap would occur, and each permit was held for 1ms. The histogram for time to clear each shard settled between 256-512us without any variance in my testing. Holding a lock for under a millisecond for 1 of the shards does not concern me as blocking --- Cargo.lock | 1 + Cargo.toml | 2 +- proxy/src/bin/proxy.rs | 18 +++- proxy/src/config.rs | 111 +++++++++++++++++++ proxy/src/console.rs | 5 + proxy/src/console/provider.rs | 166 ++++++++++++++++++++++++++++- proxy/src/console/provider/neon.rs | 29 ++++- proxy/src/proxy.rs | 1 + workspace_hack/Cargo.toml | 1 + 9 files changed, 326 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0a434c6ee6..738771f88b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6483,6 +6483,7 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "dashmap", "either", "fail", "futures", diff --git a/Cargo.toml b/Cargo.toml index 5b719d776b..e528489f1e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,7 +67,7 @@ comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" crossbeam-utils = "0.8.5" -dashmap = "5.5.0" +dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 28d0d95c5b..7d1b7eaaae 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -80,6 +80,9 @@ struct ProxyCliArgs { /// cache for `wake_compute` api method (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -220,10 +223,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl), })); + let config::WakeComputeLockOptions { + shards, + permits, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new( + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) + .unwrap(), + )); + tokio::spawn(locks.garbage_collect_worker(epoch)); + let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches); + let api = console::provider::neon::Api::new(endpoint, caches, locks); auth::BackendType::Console(Cow::Owned(api), ()) } AuthBackend::Postgres => { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9607ecd153..bd00123905 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -264,6 +264,79 @@ impl FromStr for CacheOptions { } } +/// Helper for cmdline cache options parsing. +pub struct WakeComputeLockOptions { + /// The number of shards the lock map should have + pub shards: usize, + /// The number of allowed concurrent requests for each endpoitn + pub permits: usize, + /// Garbage collection epoch + pub epoch: Duration, + /// Lock timeout + pub timeout: Duration, +} + +impl WakeComputeLockOptions { + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + + // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; + + /// Parse lock options passed via cmdline. + /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut shards = None; + let mut permits = None; + let mut epoch = None; + let mut timeout = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "shards" => shards = Some(value.parse()?), + "permits" => permits = Some(value.parse()?), + "epoch" => epoch = Some(humantime::parse_duration(value)?), + "timeout" => timeout = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + + // these dont matter if lock is disabled + if let Some(0) = permits { + timeout = Some(Duration::default()); + epoch = Some(Duration::default()); + shards = Some(2); + } + + let out = Self { + shards: shards.context("missing `shards`")?, + permits: permits.context("missing `permits`")?, + epoch: epoch.context("missing `epoch`")?, + timeout: timeout.context("missing `timeout`")?, + }; + + ensure!(out.shards > 1, "shard count must be > 1"); + ensure!( + out.shards.is_power_of_two(), + "shard count must be a power of two" + ); + + Ok(out) + } +} + +impl FromStr for WakeComputeLockOptions { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse cache lock options '{options}'"); + Self::parse(options).with_context(error) + } +} + #[cfg(test)] mod tests { use super::*; @@ -288,4 +361,42 @@ mod tests { Ok(()) } + + #[test] + fn test_parse_lock_options() -> anyhow::Result<()> { + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(permits, 4); + + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; + assert_eq!(epoch, Duration::from_secs(60)); + assert_eq!(timeout, Duration::from_millis(100)); + assert_eq!(shards, 16); + assert_eq!(permits, 8); + + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "permits=0".parse()?; + assert_eq!(epoch, Duration::ZERO); + assert_eq!(timeout, Duration::ZERO); + assert_eq!(shards, 2); + assert_eq!(permits, 0); + + Ok(()) + } } diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 0e5eaaf845..6da627389e 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -13,5 +13,10 @@ pub mod caches { pub use super::provider::{ApiCaches, NodeInfoCache}; } +/// Various cache-related types. +pub mod locks { + pub use super::provider::ApiLocks; +} + /// Console's management API. pub mod mgmt; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index c7cfc88c75..54bcd1f081 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,7 +8,13 @@ use crate::{ compute, scram, }; use async_trait::async_trait; -use std::sync::Arc; +use dashmap::DashMap; +use std::{sync::Arc, time::Duration}; +use tokio::{ + sync::{OwnedSemaphorePermit, Semaphore}, + time::Instant, +}; +use tracing::info; pub mod errors { use crate::{ @@ -149,6 +155,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + + #[error("Timeout waiting to acquire wake compute lock")] + TimeoutError, } // This allows more useful interactions than `#[from]`. 
@@ -158,6 +167,17 @@ pub mod errors { } } + impl From for WakeComputeError { + fn from(_: tokio::sync::AcquireError) -> Self { + WakeComputeError::TimeoutError + } + } + impl From for WakeComputeError { + fn from(_: tokio::time::error::Elapsed) -> Self { + WakeComputeError::TimeoutError + } + } + impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -167,6 +187,8 @@ pub mod errors { BadComputeAddress(_) => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), + + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } } @@ -233,3 +255,145 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, } + +/// Various caches for [`console`](super). +pub struct ApiLocks { + name: &'static str, + node_locks: DashMap, Arc>, + permits: usize, + timeout: Duration, + registered: prometheus::IntCounter, + unregistered: prometheus::IntCounter, + reclamation_lag: prometheus::Histogram, + lock_acquire_lag: prometheus::Histogram, +} + +impl ApiLocks { + pub fn new( + name: &'static str, + permits: usize, + shards: usize, + timeout: Duration, + ) -> prometheus::Result { + let registered = prometheus::IntCounter::with_opts( + prometheus::Opts::new( + "semaphores_registered", + "Number of semaphores registered in this api lock", + ) + .namespace(name), + )?; + prometheus::register(Box::new(registered.clone()))?; + let unregistered = prometheus::IntCounter::with_opts( + prometheus::Opts::new( + "semaphores_unregistered", + "Number of semaphores unregistered in this api lock", + ) + .namespace(name), + )?; + prometheus::register(Box::new(unregistered.clone()))?; + let reclamation_lag = prometheus::Histogram::with_opts( + prometheus::HistogramOpts::new( + "reclamation_lag_seconds", + "Time it takes to reclaim unused semaphores in the api lock", + ) + .namespace(name) + // 1us -> 65ms + // benchmarks on my mac indicate it's usually in the range of 256us and 512us + .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), + )?; + prometheus::register(Box::new(reclamation_lag.clone()))?; + let lock_acquire_lag = prometheus::Histogram::with_opts( + prometheus::HistogramOpts::new( + "semaphore_acquire_seconds", + "Time it takes to reclaim unused semaphores in the api lock", + ) + .namespace(name) + // 0.1ms -> 6s + .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), + )?; + prometheus::register(Box::new(lock_acquire_lag.clone()))?; + + Ok(Self { + name, + node_locks: DashMap::with_shard_amount(shards), + permits, + timeout, + lock_acquire_lag, + registered, + unregistered, + reclamation_lag, + }) + } + + pub async fn get_wake_compute_permit( + &self, + key: &Arc, + ) -> Result { + if self.permits == 0 { + return Ok(WakeComputePermit { permit: None }); + } + let now = Instant::now(); + let semaphore = { + // get fast path + if let Some(semaphore) = self.node_locks.get(key) { + semaphore.clone() + } else { + self.node_locks + .entry(key.clone()) + .or_insert_with(|| { + self.registered.inc(); + Arc::new(Semaphore::new(self.permits)) + }) + .clone() + } + }; + let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + + self.lock_acquire_lag + .observe((Instant::now() - now).as_secs_f64()); + + Ok(WakeComputePermit { + permit: Some(permit??), + }) + } + + pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + if self.permits == 0 { + return; + } + + let mut interval = 
tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + loop { + for (i, shard) in self.node_locks.shards().iter().enumerate() { + interval.tick().await; + // temporary lock a single shard and then clear any semaphores that aren't currently checked out + // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked + // therefore releasing it is safe from race conditions + info!( + name = self.name, + shard = i, + "performing epoch reclamation on api lock" + ); + let mut lock = shard.write(); + let timer = self.reclamation_lag.start_timer(); + let count = lock + .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .count(); + drop(lock); + self.unregistered.inc_by(count as u64); + timer.observe_duration() + } + } + } +} + +pub struct WakeComputePermit { + // None if the lock is disabled + permit: Option, +} + +impl WakeComputePermit { + pub fn should_check_cache(&self) -> bool { + self.permit.is_some() + } +} diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 6229840c46..0dc7c71534 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,12 +3,12 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -17,12 +17,17 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, caches: &'static ApiCaches, + locks: &'static ApiLocks, jwt: String, } impl Api { /// Construct an API object containing the auth parameters. - pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self { + pub fn new( + endpoint: http::Endpoint, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), @@ -30,6 +35,7 @@ impl Api { Self { endpoint, caches, + locks, jwt, } } @@ -163,9 +169,22 @@ impl super::Api for Api { return Ok(cached); } + let key: Arc = key.into(); + + let permit = self.locks.get_wake_compute_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + if let Some(cached) = self.caches.node_info.get(&key) { + info!(key = &*key, "found cached compute node info"); + return Ok(cached); + } + } + let node = self.do_wake_compute(extra, creds).await?; - let (_, cached) = self.caches.node_info.insert(key.into(), node); - info!(key = key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert(key.clone(), node); + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 54c3503c93..a1ebf03545 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -570,6 +570,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { "api_console_other_server_error" } WakeComputeError::ApiError(ApiError::Console { .. 
}) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", }; NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e2a65ad150..a088f1868b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -25,6 +25,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +dashmap = { version = "5", default-features = false, features = ["raw-api"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures = { version = "0.3" } From 893616051dc611006ef6ba098d708722dba3939c Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Thu, 9 Nov 2023 15:24:43 +0100 Subject: [PATCH 22/36] Update epic-template.md (#5709) replace the checkbox list with a a proper task list in the epic template NB: this PR does not change the code, it only touches the github issue templates --- .github/ISSUE_TEMPLATE/epic-template.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 7707e0aa67..019e6e7345 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -17,8 +17,9 @@ assignees: '' ## Implementation ideas -## Tasks -- [ ] +```[tasklist] +### Tasks +``` ## Other related tasks and Epics From 842223b47f923d3c9efc65ee4501ca1e723a2aaf Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 9 Nov 2023 16:40:52 +0200 Subject: [PATCH 23/36] fix(metric): remove pageserver_wal_redo_wait_seconds (#5791) the meaning of the values recorded in this histogram changed with #5560 and we never had it visualized as a histogram, just the `increase(_sum)`. The histogram is not too interesting to look at, so remove it per discussion in [slack thread](https://neondb.slack.com/archives/C063LJFF26S/p1699008316109999?thread_ts=1698852436.637559&cid=C063LJFF26S). 
--- pageserver/src/metrics.rs | 10 ---------- pageserver/src/walredo.rs | 9 ++------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 429ab801d9..3d11daed96 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1225,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the Postgres WAL redo process", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric") -}); - pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", @@ -1928,7 +1919,6 @@ pub fn preinitialize_metrics() { &READ_NUM_FS_LAYERS, &WAIT_LSN_TIME, &WAL_REDO_TIME, - &WAL_REDO_WAIT_TIME, &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, ] diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 38f2c64420..4a9524b5e4 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -44,7 +44,6 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, - WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; @@ -207,11 +206,8 @@ impl PostgresRedoManager { ) -> anyhow::Result { let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; - let start_time = Instant::now(); let mut n_attempts = 0u32; loop { - let lock_time = Instant::now(); - // launch the WAL redo process on first use let proc: Arc = { let proc_guard = self.redo_process.read().unwrap(); @@ -236,7 +232,7 @@ impl PostgresRedoManager { } }; - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); + let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; @@ -244,8 +240,7 @@ impl PostgresRedoManager { .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout) .context("apply_wal_records"); - let end_time = Instant::now(); - let duration = end_time.duration_since(lock_time); + let duration = started_at.elapsed(); let len = records.len(); let nbytes = records.iter().fold(0, |acumulator, record| { From 4469b1a62c869de93cb2b3261ef690db7d640189 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 9 Nov 2023 10:47:03 -0500 Subject: [PATCH 24/36] Fix blob_io test (#5818) --- pageserver/src/tenant/blob_io.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index bedf09a40c..6de2e95055 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -327,7 +327,7 @@ mod tests { let mut sz: u16 = rng.gen(); // Make 50% of the arrays small if rng.gen() { - sz |= 63; + sz &= 63; } random_array(sz.into()) }) From e0821e1eab80fc4ea147abefeb0743110dd94689 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Nov 2023 16:02:59 +0000 Subject: [PATCH 25/36] pageserver: refined Timeline shutdown (#5833) ## Problem We have observed the shutdown of a timeline taking a long time when a deletion arrives at a busy time for the system. 
This suggests that we are not respecting cancellation tokens promptly enough. ## Summary of changes - Refactor timeline shutdown so that rather than having a shutdown() function that takes a flag for optionally flushing, there are two distinct functions, one for graceful flushing shutdown, and another that does the "normal" shutdown where we're just setting a cancellation token and then tearing down as fast as we can. This makes things a bit easier to reason about, and enables us to remove the hand-written variant of shutdown that was maintained in `delete.rs` - Layer flush task checks cancellation token more carefully - Logical size calculation's handling of cancellation tokens is simplified: rather than passing one in, it respects the Timeline's cancellation token. This PR doesn't touch RemoteTimelineClient, which will be a key thing to fix as well, so that a slow remote storage op doesn't hold up shutdown. --- libs/utils/src/seqwait.rs | 3 + pageserver/src/http/routes.rs | 6 +- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 4 +- pageserver/src/tenant.rs | 10 +- pageserver/src/tenant/size.rs | 19 +- pageserver/src/tenant/timeline.rs | 223 +++++++++++------- .../src/tenant/timeline/eviction_task.rs | 16 +- 8 files changed, 164 insertions(+), 123 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 5bc7ca91d6..effc9c67b5 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -125,6 +125,9 @@ where // Wake everyone with an error. let mut internal = self.internal.lock().unwrap(); + // Block any future waiters from starting + internal.shutdown = true; + // This will steal the entire waiters map. // When we drop it all waiters will be woken. mem::take(&mut internal.waiters) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7a8f37f923..63016042cf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -303,11 +303,7 @@ async fn build_timeline_info( // we're executing this function, we will outlive the timeline on-disk state. info.current_logical_size_non_incremental = Some( timeline - .get_current_logical_size_non_incremental( - info.last_record_lsn, - CancellationToken::new(), - ctx, - ) + .get_current_logical_size_non_incremental(info.last_record_lsn, ctx) .await?, ); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2201d6c86b..ee5f1732e4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -512,7 +512,11 @@ impl PageServerHandler { }; if let Err(e) = &response { - if timeline.cancel.is_cancelled() { + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + if timeline.cancel.is_cancelled() || timeline.is_stopping() { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index aa4d155bcc..9e8a6b02cc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,7 +21,6 @@ use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; -use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -578,7 +577,6 @@ impl Timeline { pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, - cancel: CancellationToken, ctx: &RequestContext, ) -> Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -590,7 +588,7 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { - if cancel.is_cancelled() { + if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } let relsize_key = rel_size_to_key(rel); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dc07ea7346..758f8b15a1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1841,7 +1841,13 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let span = Span::current(); - js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await }); + js.spawn(async move { + if freeze_and_flush { + timeline.flush_and_shutdown().instrument(span).await + } else { + timeline.shutdown().instrument(span).await + } + }); }) }; tracing::info!("Waiting for timelines..."); @@ -4727,7 +4733,7 @@ mod tests { // Keeps uninit mark in place let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown(false) + .shutdown() .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) .await; std::mem::forget(tline); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e4df94b8e9..a85dc9231c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; -use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; @@ -350,10 +349,6 @@ async fn fill_logical_sizes( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); - let cancel = tokio_util::sync::CancellationToken::new(); - // be sure to cancel all spawned tasks if we are dropped - let _dg = cancel.clone().drop_guard(); - // For each point that would benefit from having a logical size available, // spawn a Task to fetch it, unless we have it cached already. 
for seg in segments.iter() { @@ -371,15 +366,8 @@ async fn fill_logical_sizes( let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child(); joinset.spawn( - calculate_logical_size( - parallel_size_calcs, - timeline, - lsn, - cause, - ctx, - cancel.child_token(), - ) - .in_current_span(), + calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx) + .in_current_span(), ); } e.insert(cached_size); @@ -487,14 +475,13 @@ async fn calculate_logical_size( lsn: utils::lsn::Lsn, cause: LogicalSizeCalculationCause, ctx: RequestContext, - cancel: CancellationToken, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel) + .spawn_ondemand_logical_size_calculation(lsn, cause, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 36629e0655..bbb96cb172 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -36,7 +36,6 @@ use std::time::{Duration, Instant, SystemTime}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, }; -use crate::deletion_queue::DeletionQueueClient; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, @@ -50,6 +49,7 @@ use crate::tenant::{ metadata::{save_metadata, TimelineMetadata}, par_fsync, }; +use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; @@ -247,7 +247,7 @@ pub struct Timeline { /// the flush finishes. You can use that to wait for the flush to finish. layer_flush_start_tx: tokio::sync::watch::Sender, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel - layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>, + layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. @@ -374,6 +374,19 @@ pub enum PageReconstructError { WalRedo(anyhow::Error), } +#[derive(thiserror::Error, Debug)] +enum FlushLayerError { + /// Timeline cancellation token was cancelled + #[error("timeline shutting down")] + Cancelled, + + #[error(transparent)] + PageReconstructError(#[from] PageReconstructError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { @@ -891,15 +904,16 @@ impl Timeline { self.launch_eviction_task(background_jobs_can_start); } + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O. 
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub async fn shutdown(self: &Arc, freeze_and_flush: bool) { + pub(crate) async fn flush_and_shutdown(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out - tracing::debug!("Cancelling CancellationToken"); - self.cancel.cancel(); - - // prevent writes to the InMemoryLayer + // Stop ingesting data, so that we are not still writing to an InMemoryLayer while + // trying to flush tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), @@ -908,40 +922,70 @@ impl Timeline { ) .await; + // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + self.last_record_lsn.shutdown(); + // now all writers to InMemory layer are gone, do the final flush if requested - if freeze_and_flush { - match self.freeze_and_flush().await { - Ok(()) => {} - Err(e) => { - warn!("failed to freeze and flush: {e:#}"); - return; // TODO: should probably drain remote timeline client anyways? + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + if let Err(e) = client.wait_completion().await { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to flush to remote storage: {e:#}"); + } } } - - // drain the upload queue - let res = if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.wait_completion().await - } else { - Ok(()) - }; - - if let Err(e) = res { - warn!("failed to await for frozen and flushed uploads: {e:#}"); + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } } + self.shutdown().await; + } + + /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of + /// the graceful [`Timeline::flush_and_shutdown`] function. + pub(crate) async fn shutdown(&self) { + // Signal any subscribers to our cancellation token to drop out + tracing::debug!("Cancelling CancellationToken"); + self.cancel.cancel(); + // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel // while doing so. 
self.last_record_lsn.shutdown(); + // Shut down the layer flush task before the remote client, as one depends on the other + task_mgr::shutdown_tasks( + Some(TaskKind::LayerFlushTask), + Some(self.tenant_id), + Some(self.timeline_id), + ) + .await; + + // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in + // case our caller wants to use that for a deletion + if let Some(remote_client) = self.remote_client.as_ref() { + match remote_client.stop() { + Ok(()) => {} + Err(StopError::QueueUninitialized) => { + // Shutting down during initialization is legal + } + } + } + tracing::debug!("Waiting for tasks..."); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await; // Finally wait until any gate-holders are complete @@ -985,7 +1029,12 @@ impl Timeline { reason, backtrace: backtrace_str, }; - self.set_state(broken_state) + self.set_state(broken_state); + + // Although the Broken state is not equivalent to shutdown() (shutdown will be called + // later when this tenant is detach or the process shuts down), firing the cancellation token + // here avoids the need for other tasks to watch for the Broken state explicitly. + self.cancel.cancel(); } pub fn current_state(&self) -> TimelineState { @@ -1741,12 +1790,8 @@ impl Timeline { // delay will be terminated by a timeout regardless. let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; - // no extra cancellation here, because nothing really waits for this to complete compared - // to spawn_ondemand_logical_size_calculation. - let cancel = CancellationToken::new(); - let calculated_size = match self_clone - .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) + .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx) .await { Ok(s) => s, @@ -1815,7 +1860,6 @@ impl Timeline { lsn: Lsn, cause: LogicalSizeCalculationCause, ctx: RequestContext, - cancel: CancellationToken, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); @@ -1836,7 +1880,7 @@ impl Timeline { false, async move { let res = self_clone - .logical_size_calculation_task(lsn, cause, &ctx, cancel) + .logical_size_calculation_task(lsn, cause, &ctx) .await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors @@ -1852,58 +1896,28 @@ impl Timeline { lsn: Lsn, cause: LogicalSizeCalculationCause, ctx: &RequestContext, - cancel: CancellationToken, ) -> Result { span::debug_assert_current_span_has_tenant_and_timeline_id(); - let mut timeline_state_updates = self.subscribe_for_state_updates(); + let _guard = self.gate.enter(); + let self_calculation = Arc::clone(self); let mut calculation = pin!(async { - let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, cancel, &ctx) + .calculate_logical_size(lsn, cause, &ctx) .await }); - let timeline_state_cancellation = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = timeline_state_updates.borrow().clone(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken { .. 
} - | TimelineState::Stopping - | TimelineState::Loading => { - break format!("aborted because timeline became inactive (new state: {new_state:?})") - } - } - } - Err(_sender_dropped_error) => { - // can't happen, the sender is not dropped as long as the Timeline exists - break "aborted because state watch was dropped".to_string(); - } - } - } - }; - - let taskmgr_shutdown_cancellation = async { - task_mgr::shutdown_watcher().await; - "aborted because task_mgr shutdown requested".to_string() - }; tokio::select! { res = &mut calculation => { res } - reason = timeline_state_cancellation => { - debug!(reason = reason, "cancelling calculation"); - cancel.cancel(); + _ = self.cancel.cancelled() => { + debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - reason = taskmgr_shutdown_cancellation => { - debug!(reason = reason, "cancelling calculation"); - cancel.cancel(); + _ = task_mgr::shutdown_watcher() => { + debug!("cancelling logical size calculation for task shutdown"); calculation.await } } @@ -1917,7 +1931,6 @@ impl Timeline { &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, - cancel: CancellationToken, ctx: &RequestContext, ) -> Result { info!( @@ -1960,7 +1973,7 @@ impl Timeline { }; let timer = storage_time_metrics.start_timer(); let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) + .get_current_logical_size_non_incremental(up_to_lsn, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -2373,6 +2386,10 @@ impl Timeline { info!("started flush loop"); loop { tokio::select! { + _ = self.cancel.cancelled() => { + info!("shutting down layer flush task"); + break; + }, _ = task_mgr::shutdown_watcher() => { info!("shutting down layer flush task"); break; @@ -2384,6 +2401,14 @@ impl Timeline { let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { + if self.cancel.is_cancelled() { + info!("dropping out of flush loop for timeline shutdown"); + // Note: we do not bother transmitting into [`layer_flush_done_tx`], because + // anyone waiting on that will respect self.cancel as well: they will stop + // waiting at the same time we as drop out of this loop. + return; + } + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2392,9 +2417,18 @@ impl Timeline { let Some(layer_to_flush) = layer_to_flush else { break Ok(()); }; - if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { - error!("could not flush frozen layer: {err:?}"); - break Err(err); + match self.flush_frozen_layer(layer_to_flush, ctx).await { + Ok(()) => {} + Err(FlushLayerError::Cancelled) => { + info!("dropping out of flush loop for timeline shutdown"); + return; + } + err @ Err( + FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_), + ) => { + error!("could not flush frozen layer: {err:?}"); + break err; + } } }; // Notify any listeners that we're done @@ -2443,7 +2477,17 @@ impl Timeline { } } trace!("waiting for flush to complete"); - rx.changed().await?; + tokio::select! { + rx_e = rx.changed() => { + rx_e?; + }, + // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring + // the notification from [`flush_loop`] that it completed. 
+ _ = self.cancel.cancelled() => { + tracing::info!("Cancelled layer flush due on timeline shutdown"); + return Ok(()) + } + }; trace!("done") } } @@ -2458,7 +2502,7 @@ impl Timeline { self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), FlushLayerError> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -2483,6 +2527,11 @@ impl Timeline { let (partitioning, _lsn) = self .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + // For image layers, we add them immediately into the layer map. ( self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) @@ -2514,6 +2563,10 @@ impl Timeline { ) }; + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); @@ -2523,6 +2576,10 @@ impl Timeline { let metadata = { let mut guard = self.layers.write().await; + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); if disk_consistent_lsn != old_disk_consistent_lsn { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index e3aad22e40..183fcb872f 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -326,8 +326,7 @@ impl Timeline { match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_timeline_cached_layer_accesses(cancel, ctx) - .await; + self.imitate_timeline_cached_layer_accesses(ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } @@ -367,21 +366,12 @@ impl Timeline { /// Recompute the values which would cause on-demand downloads during restart. #[instrument(skip_all)] - async fn imitate_timeline_cached_layer_accesses( - &self, - cancel: &CancellationToken, - ctx: &RequestContext, - ) { + async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self - .calculate_logical_size( - lsn, - LogicalSizeCalculationCause::EvictionTaskImitation, - cancel.clone(), - ctx, - ) + .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx) .instrument(info_span!("calculate_logical_size")) .await; From f95f001b8bc19ec0697f47bff4a0a0b7c2ce5c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 Nov 2023 17:12:18 +0100 Subject: [PATCH 26/36] Lsn for get_timestamp_of_lsn should be string, not integer (#5840) The `get_timestamp_of_lsn` pageserver endpoint has been added in #5497, but the yml it added was wrong: the lsn is expected in hex format, not in integer (decimal) format. 
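For illustration only (not part of this patch): a minimal, standalone sketch of the assumed `hi/lo` hex rendering of a 64-bit LSN, showing why the query parameter has to be a string such as `0/16B9188` rather than the decimal integer `23826824`.

```rust
// Hypothetical helper, mirroring the usual "high/low 32-bit halves in hex" LSN notation.
fn lsn_to_hex_string(lsn: u64) -> String {
    format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff)
}

fn main() {
    // 0x16B9188 == 23_826_824 in decimal; the endpoint expects the hex form.
    assert_eq!(lsn_to_hex_string(0x16B9188), "0/16B9188");
}
```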
--- pageserver/src/http/openapi_spec.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 9bff0fd668..4d455243f0 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -352,7 +352,8 @@ paths: in: query required: true schema: - type: integer + type: string + format: hex description: A LSN to get the timestamp responses: "200": From f5344fb85a3e8f1fbf0352c12cb378cef5d20bcc Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 9 Nov 2023 23:31:53 +0200 Subject: [PATCH 27/36] temp: log all layer loading errors while we lose them (#5816) Temporary workaround while some errors are not being logged. Cc: #5815. --- pageserver/src/tenant/storage_layer/layer.rs | 6 +++++- test_runner/regress/test_broken_timeline.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d72982a9a0..17df39733f 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -251,6 +251,7 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) + .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) .await } @@ -1211,8 +1212,10 @@ impl DownloadedLayer { // this will be a permanent failure .context("load layer"); - if res.is_err() { + if let Err(e) = res.as_ref() { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); + // TODO(#5815): we are not logging all errors, so temporarily log them here as well + tracing::error!("layer loading failed permanently: {e:#}"); } res }; @@ -1291,6 +1294,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. + #[tracing::instrument(skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b1b47b3f2c..23839a4dd1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -26,6 +26,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", + ".*layer loading failed permanently: load layer: .*", ] ) From 8dd29f1e27ffd3652727ebd1aed36fbccf9b77ad Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 9 Nov 2023 23:36:57 +0200 Subject: [PATCH 28/36] fix(pageserver): spawn all kinds of tenant shutdowns (#5841) Minor bugfix, something noticed while manual code-review. Use the same joinset for inprogress tenants so we can get the benefit of the buffering logging just as we get for attached tenants, and no single inprogress task can hold up shutdown of other tenants. --- pageserver/src/tenant/mgr.rs | 91 ++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 50 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 07d1618272..c71eac0672 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -566,8 +566,10 @@ pub(crate) async fn shutdown_all_tenants() { async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { use utils::completion; - // Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants. 
- let (in_progress_ops, tenants_to_shut_down) = { + let mut join_set = JoinSet::new(); + + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. + let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); match &mut *m { TenantsMap::Initializing => { @@ -577,78 +579,67 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { } TenantsMap::Open(tenants) => { let mut shutdown_state = HashMap::new(); - let mut in_progress_ops = Vec::new(); - let mut tenants_to_shut_down = Vec::new(); + let mut total_in_progress = 0; + let mut total_attached = 0; - for (k, v) in tenants.drain() { + for (tenant_id, v) in tenants.drain() { match v { TenantSlot::Attached(t) => { - tenants_to_shut_down.push(t.clone()); - shutdown_state.insert(k, TenantSlot::Attached(t)); + shutdown_state.insert(tenant_id, TenantSlot::Attached(t.clone())); + join_set.spawn( + async move { + let freeze_and_flush = true; + + let res = { + let (_guard, shutdown_progress) = completion::channel(); + t.shutdown(shutdown_progress, freeze_and_flush).await + }; + + if let Err(other_progress) = res { + // join the another shutdown in progress + other_progress.wait().await; + } + + // we cannot afford per tenant logging here, because if s3 is degraded, we are + // going to log too many lines + debug!("tenant successfully stopped"); + } + .instrument(info_span!("shutdown", %tenant_id)), + ); + + total_attached += 1; } TenantSlot::Secondary => { - shutdown_state.insert(k, TenantSlot::Secondary); + shutdown_state.insert(tenant_id, TenantSlot::Secondary); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will // wait for their notifications to fire in this function. - in_progress_ops.push(notify); + join_set.spawn(async move { + notify.wait().await; + }); + + total_in_progress += 1; } } } *m = TenantsMap::ShuttingDown(shutdown_state); - (in_progress_ops, tenants_to_shut_down) + (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - // TODO: it is possible that detach and shutdown happen at the same time. as a - // result, during shutdown we do not wait for detach. 
error!("already shutting down, this function isn't supposed to be called more than once"); return; } } }; + let started_at = std::time::Instant::now(); + info!( "Waiting for {} InProgress tenants and {} Attached tenants to shut down", - in_progress_ops.len(), - tenants_to_shut_down.len() + total_in_progress, total_attached ); - for barrier in in_progress_ops { - barrier.wait().await; - } - - info!( - "InProgress tenants shut down, waiting for {} Attached tenants to shut down", - tenants_to_shut_down.len() - ); - let started_at = std::time::Instant::now(); - let mut join_set = JoinSet::new(); - for tenant in tenants_to_shut_down { - let tenant_id = tenant.get_tenant_id(); - join_set.spawn( - async move { - let freeze_and_flush = true; - - let res = { - let (_guard, shutdown_progress) = completion::channel(); - tenant.shutdown(shutdown_progress, freeze_and_flush).await - }; - - if let Err(other_progress) = res { - // join the another shutdown in progress - other_progress.wait().await; - } - - // we cannot afford per tenant logging here, because if s3 is degraded, we are - // going to log too many lines - - debug!("tenant successfully stopped"); - } - .instrument(info_span!("shutdown", %tenant_id)), - ); - } - let total = join_set.len(); let mut panicked = 0; let mut buffering = true; @@ -661,7 +652,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { match joined { Ok(()) => {} Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures"); + unreachable!("we are not cancelling any of the tasks"); } Err(join_error) if join_error.is_panic() => { // cannot really do anything, as this panic is likely a bug From 8e5e3971ba4f05f17b17fd3471b9eb839a4da0ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 10 Nov 2023 13:38:44 +0100 Subject: [PATCH 29/36] find_lsn_for_timestamp fixes (#5844) Includes the changes of #3689 that address point 1 of #3689, plus some further improvements. In particular, this PR does: * set `min_lsn` to a safe value to create branches from (and verify it in tests) * return `min_lsn` instead of `max_lsn` for `NoData` and `Past` (verify it in test for `Past`, `NoData` is harder and not as important) * return `commit_lsn` instead of `max_lsn` for Future (and verify it in the tests) * add some comments Split out of #5686 to get something more minimal out to users. --- pageserver/src/pgdatadir_mapping.rs | 62 ++++++++++++++++++------- test_runner/regress/test_lsn_mapping.py | 62 +++++++++++++++++++++---- 2 files changed, 98 insertions(+), 26 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9e8a6b02cc..09feba9b68 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -29,9 +29,33 @@ pub type BlockNumber = u32; #[derive(Debug)] pub enum LsnForTimestamp { + /// Found commits both before and after the given timestamp Present(Lsn), + + /// Found no commits after the given timestamp, this means + /// that the newest data in the branch is older than the given + /// timestamp. + /// + /// All commits <= LSN happened before the given timestamp Future(Lsn), + + /// The queried timestamp is past our horizon we look back at (PITR) + /// + /// All commits > LSN happened after the given timestamp, + /// but any commits < LSN might have happened before or after + /// the given timestamp. We don't know because no data before + /// the given lsn is available. 
Past(Lsn), + + /// We have found no commit with a timestamp, + /// so we can't return anything meaningful. + /// + /// The associated LSN is the lower bound value we can safely + /// create branches on, but no statement is made if it is + /// older or newer than the timestamp. + /// + /// This variant can e.g. be returned right after a + /// cluster import. NoData(Lsn), } @@ -324,7 +348,11 @@ impl Timeline { ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); - let min_lsn = *gc_cutoff_lsn_guard; + // We use this method to figure out the branching LSN for the new branch, but the + // GC cutoff could be before the branching point and we cannot create a new branch + // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be + // on the safe side. + let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn()); let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the @@ -354,30 +382,32 @@ impl Timeline { low = mid + 1; } } + // If `found_smaller == true`, `low` is the LSN of the last commit record + // before or at `search_timestamp` + // + // FIXME: it would be better to get the LSN of the previous commit. + // Otherwise, if you restore to the returned LSN, the database will + // include physical changes from later commits that will be marked + // as aborted, and will need to be vacuumed away. + let commit_lsn = Lsn((low - 1) * 8); match (found_smaller, found_larger) { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. - Ok(LsnForTimestamp::NoData(max_lsn)) - } - (true, false) => { - // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + Ok(LsnForTimestamp::NoData(min_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + Ok(LsnForTimestamp::Past(min_lsn)) } - (true, true) => { - // low is the LSN of the first commit record *after* the search_timestamp, - // Back off by one to get to the point just before the commit. - // - // FIXME: it would be better to get the LSN of the previous commit. - // Otherwise, if you restore to the returned LSN, the database will - // include physical changes from later commits that will be marked - // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + (true, false) => { + // Only found commits with timestamps smaller than the request. + // It's still a valid case for branch creation, return it. + // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future` + // case, anyway. 
+ Ok(LsnForTimestamp::Future(commit_lsn)) } + (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)), } } diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 726bfa5f29..f79c1c347c 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -79,13 +79,32 @@ def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + # disable default GC and compaction + "gc_period": "1000 m", + "compaction_period": "0 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + } + ) + + timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) + endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) + timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] + log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() + + # Obtain an lsn before all write operations on this branch + start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()")) + # Create table, and insert rows, each in a separate transaction # Disable synchronous_commit to make this initialization go faster. + # Disable `synchronous_commit` to make this initialization go faster. + # XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off` + # doesn't change anything. # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. 
@@ -104,40 +123,63 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): cur.execute("INSERT INTO foo VALUES (-1)") # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id) with env.pageserver.http_client() as client: - # Check edge cases: timestamp in the future + # Check edge cases + # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) assert result["kind"] == "future" + # make sure that we return a well advanced lsn here + assert Lsn(result["lsn"]) > start_lsn - # timestamp too the far history + # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) assert result["kind"] == "past" + # make sure that we return the minimum lsn here at the start of the range + assert Lsn(result["lsn"]) < start_lsn # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) + assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn + branch_name="test_lsn_mapping", + endpoint_id="ep-lsn_mapping_read", + lsn=lsn, + tenant_id=tenant_id, ) assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i endpoint_here.stop_and_destroy() + # Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff + timeline_id_child = env.neon_cli.create_branch( + "test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping" + ) + + # Timestamp is in the unreachable past + probe_timestamp = tbl[0][1] - timedelta(hours=10) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 + ) + assert result["kind"] == "past" + # make sure that we return the minimum lsn here at the start of the range + assert Lsn(result["lsn"]) >= last_flush_lsn + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): From 6e145a44fa6934021326448fee610a96ebbffcd2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 10 Nov 2023 12:45:41 +0000 Subject: [PATCH 30/36] workflows/neon_extra_builds: run check-codestyle-rust & build-neon on arm64 (#5832) ## Problem Some developers use workstations with arm CPUs, and sometimes x86-64 code is not fully compatible with it (for example, https://github.com/neondatabase/neon/pull/5827). Although we don't have arm CPUs in the prod (yet?), it is worth having some basic checks for this architecture to have a better developer experience. 
Closes https://github.com/neondatabase/neon/issues/5829 ## Summary of changes - Run `check-codestyle-rust`-like & `build-neon`-like jobs on Arm runner - Add `run-extra-build-*` label to run all available extra builds --- .github/actionlint.yml | 2 + .github/workflows/neon_extra_builds.yml | 181 +++++++++++++++++++++++- 2 files changed, 181 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index e2ece5f230..362480f256 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,5 +1,7 @@ self-hosted-runner: labels: + - arm64 + - dev - gen3 - large - small diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d7f5295c5b..0d7db8dfbc 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -21,7 +21,10 @@ env: jobs: check-macos-build: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 90 runs-on: macos-latest @@ -112,8 +115,182 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh + check-linux-arm-build: + timeout-minutes: 90 + runs-on: [ self-hosted, dev, arm64 ] + + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + CARGO_FEATURES: --features testing + CARGO_FLAGS: --locked --release + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + - name: Set env variables + run: | + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ 
env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + - name: Run cargo test + run: | + cargo test $CARGO_FLAGS $CARGO_FEATURES + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + + check-codestyle-rust-arm: + timeout-minutes: 90 + runs-on: [ self-hosted, dev, arm64 ] + + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
+ # NB: keep clippy args in sync with ./run_clippy.sh + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + - name: Run cargo clippy (release) + run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check + gather-rust-build-stats: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned From 71b380f90a8f9b5813097a6bf961ad9cb73c416c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 10 Nov 2023 12:49:52 +0000 Subject: [PATCH 31/36] Set BUILD_TAG for build-neon job (#5847) ## Problem I've added `BUILD_TAG` to docker images. (https://github.com/neondatabase/neon/pull/5812), but forgot to add it to services that we build for tests ## Summary of changes - Set `BUILD_TAG` in `build-neon` job --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index babf767fcf..6068177245 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -175,7 +175,7 @@ jobs: run: cargo deny check build-neon: - needs: [ check-permissions ] + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -187,6 +187,7 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ needs.tag.outputs.build-tag }} steps: - name: Fix git ownership From a6f892e200ef7bd548b640b5db430599f54d902e Mon Sep 17 00:00:00 2001 From: Rahul Modpur Date: Fri, 10 Nov 2023 18:35:22 +0530 Subject: [PATCH 32/36] metric: add started and killed walredo processes counter (#5809) In OOM situations, knowing exactly how many walredo processes there were at a time would help afterwards to understand why was pageserver OOM killed. Add `pageserver_wal_redo_process_total` metric to keep track of total wal redo process started, shutdown and killed since pageserver start. 
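As a rough, standalone sketch of the counter pattern (using the `prometheus` crate directly; the metric and label names below are illustrative, not the ones added by this patch):

```rust
use prometheus::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};

fn main() {
    // One plain counter bumped per spawned walredo process.
    let started: IntCounter =
        register_int_counter!("walredo_process_started_total", "processes started").unwrap();
    // One labelled counter so each kill site can record why the process was stopped.
    let stopped: IntCounterVec = register_int_counter_vec!(
        "walredo_process_stopped_total",
        "processes stopped, by cause",
        &["cause"]
    )
    .unwrap();

    started.inc();
    stopped.with_label_values(&["startup"]).inc();
    assert_eq!(stopped.with_label_values(&["startup"]).get(), 1);
}
```

Keeping the cause as a label (rather than registering a separate metric per cause) makes it cheap to add new kill causes later.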
Closes #5722 --------- Signed-off-by: Rahul Modpur Co-authored-by: Joonas Koivunen Co-authored-by: Christian Schwarz --- pageserver/src/metrics.rs | 40 +++++++++++++++++++++++++++++++++++++++ pageserver/src/walredo.rs | 22 ++++++++++++--------- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3d11daed96..4b52f07326 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1252,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +pub(crate) struct WalRedoProcessCounters { + pub(crate) started: IntCounter, + pub(crate) killed_by_cause: enum_map::EnumMap, +} + +#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)] +pub(crate) enum WalRedoKillCause { + WalRedoProcessDrop, + NoLeakChildDrop, + Startup, +} + +impl Default for WalRedoProcessCounters { + fn default() -> Self { + let started = register_int_counter!( + "pageserver_wal_redo_process_started_total", + "Number of WAL redo processes started", + ) + .unwrap(); + + let killed = register_int_counter_vec!( + "pageserver_wal_redo_process_stopped_total", + "Number of WAL redo processes stopped", + &["cause"], + ) + .unwrap(); + Self { + started, + killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { + let cause = ::from_usize(i); + let cause_str: &'static str = cause.into(); + killed.with_label_values(&[cause_str]) + })), + } + } +} + +pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = + Lazy::new(WalRedoProcessCounters::default); + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 4a9524b5e4..ccdf621c30 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,7 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use crate::config::PageServerConf; use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; @@ -662,10 +663,10 @@ impl WalRedoProcess { .close_fds() .spawn_no_leak_child(tenant_id) .context("spawn process")?; - + WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(); + child.kill_and_wait(WalRedoKillCause::Startup); }); let stdin = child.stdin.take().unwrap(); @@ -996,7 +997,7 @@ impl Drop for WalRedoProcess { self.child .take() .expect("we only do this once") - .kill_and_wait(); + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); self.stderr_logger_cancel.cancel(); // no way to wait for stderr_logger_task from Drop because that is async only } @@ -1032,16 +1033,19 @@ impl NoLeakChild { }) } - fn kill_and_wait(mut self) { + fn kill_and_wait(mut self, cause: WalRedoKillCause) { let child = match self.child.take() { Some(child) => child, None => return, }; - Self::kill_and_wait_impl(child); + Self::kill_and_wait_impl(child, cause); } - #[instrument(skip_all, fields(pid=child.id()))] - fn kill_and_wait_impl(mut child: Child) { + #[instrument(skip_all, fields(pid=child.id(), ?cause))] + fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { + 
scopeguard::defer! { + WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); + } let res = child.kill(); if let Err(e) = res { // This branch is very unlikely because: @@ -1086,7 +1090,7 @@ impl Drop for NoLeakChild { // This thread here is going to outlive of our dropper. let span = tracing::info_span!("walredo", %tenant_id); let _entered = span.enter(); - Self::kill_and_wait_impl(child); + Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) .await }); From d672e44eee7740277b670fc7e4667c9f0ea04355 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 10 Nov 2023 13:58:18 +0000 Subject: [PATCH 33/36] pageserver: error type for collect_keyspace (#5846) ## Problem This is a log hygiene fix, for an occasional test failure. warn-level logging in imitate_timeline_cached_layer_accesses can't distinguish actual errors from shutdown cases. ## Summary of changes Replaced anyhow::Error with an explicit CollectKeySpaceError type, that includes conversion from PageReconstructError::Cancelled. --- pageserver/src/http/routes.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 30 +++++++++++++++---- .../src/tenant/timeline/eviction_task.rs | 14 +++++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 63016042cf..2915178104 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1478,7 +1478,7 @@ async fn timeline_collect_keyspace( let keys = timeline .collect_keyspace(at_lsn, &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, Partitioning { keys, at_lsn }) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 09feba9b68..abdb4a8379 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,6 +22,7 @@ use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; use tracing::{debug, trace, warn}; +use utils::bin_ser::DeserializeError; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. @@ -67,6 +68,25 @@ pub enum CalculateLogicalSizeError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub(crate) enum CollectKeySpaceError { + #[error(transparent)] + Decode(#[from] DeserializeError), + #[error(transparent)] + PageRead(PageReconstructError), + #[error("cancelled")] + Cancelled, +} + +impl From for CollectKeySpaceError { + fn from(err: PageReconstructError) -> Self { + match err { + PageReconstructError::Cancelled => Self::Cancelled, + err => Self::PageRead(err), + } + } +} + impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { @@ -635,11 +655,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). 
- pub async fn collect_keyspace( + pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -648,7 +668,7 @@ impl Timeline { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; + let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -681,7 +701,7 @@ impl Timeline { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; + let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { @@ -699,7 +719,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; + let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 183fcb872f..79bc434a2a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, + pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, @@ -397,9 +398,16 @@ impl Timeline { if size.is_err() { // ignore, see above comment } else { - warn!( - "failed to collect keyspace but succeeded in calculating logical size: {e:#}" - ); + match e { + CollectKeySpaceError::Cancelled => { + // Shutting down, ignore + } + err => { + warn!( + "failed to collect keyspace but succeeded in calculating logical size: {err:#}" + ); + } + } } } } From a05f104cce70fbd7330704662108ed8934408328 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 Nov 2023 16:05:21 +0200 Subject: [PATCH 34/36] build: remove async-std dependency (#5848) Introduced by accident (missing `default-features = false`) in e09d5ada6a. We directly need only `http_types::StatusCode`. 
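For context, a hypothetical manifest entry (not the literal change in this PR) showing the shape of the fix: opting out of the crate's default features so transitive dependencies like `async-std` stop being pulled in.

```toml
# Hypothetical Cargo.toml sketch: keep only what we use (StatusCode) from http-types.
[dependencies]
http-types = { version = "2", default-features = false }
```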
--- Cargo.lock | 597 +++----------------------------------- Cargo.toml | 2 +- workspace_hack/Cargo.toml | 6 +- 3 files changed, 50 insertions(+), 555 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 738771f88b..44ea46efda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,60 +23,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aead" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fc95d1bdb8e6666b2b217308eeeb09f2d6728d104be3e31916cc74d15420331" -dependencies = [ - "generic-array", -] - -[[package]] -name = "aes" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "884391ef1066acaa41e766ba8f596341b96e93ce34f9a43e7d24bf0a0eaf0561" -dependencies = [ - "aes-soft", - "aesni", - "cipher", -] - -[[package]] -name = "aes-gcm" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5278b5fabbb9bd46e24aa69b2fdea62c99088e0a950a9be40e3e0101298f88da" -dependencies = [ - "aead", - "aes", - "cipher", - "ctr", - "ghash", - "subtle", -] - -[[package]] -name = "aes-soft" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be14c7498ea50828a38d0e24a765ed2effe92a705885b57d029cd67d45744072" -dependencies = [ - "cipher", - "opaque-debug", -] - -[[package]] -name = "aesni" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2e11f5e94c2f7d386164cc2aa1f97823fed6f259e486940a71c174dd01b0ce" -dependencies = [ - "cipher", - "opaque-debug", -] - [[package]] name = "ahash" version = "0.8.3" @@ -198,7 +144,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -248,55 +194,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "async-executor" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1da3ae8dabd9c00f453a329dfe1fb28da3c0a72e2478cdcd93171740c20499" -dependencies = [ - "async-lock", - "async-task", - "concurrent-queue", - "fastrand 2.0.0", - "futures-lite", - "slab", -] - -[[package]] -name = "async-global-executor" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" -dependencies = [ - "async-channel", - "async-executor", - "async-io", - "async-lock", - "blocking", - "futures-lite", - "once_cell", -] - -[[package]] -name = "async-io" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" -dependencies = [ - "async-lock", - "autocfg", - "cfg-if", - "concurrent-queue", - "futures-lite", - "log", - "parking", - "polling", - "rustix 0.37.25", - "slab", - "socket2 0.4.9", - "waker-fn", -] - [[package]] name = "async-lock" version = "2.8.0" @@ -306,32 +203,6 @@ dependencies = [ "event-listener", ] -[[package]] -name = "async-std" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" -dependencies = [ - "async-channel", - "async-global-executor", - "async-io", - "async-lock", - "crossbeam-utils", - "futures-channel", - "futures-core", - "futures-io", - "futures-lite", - "gloo-timers", - "kv-log-macro", - "log", - "memchr", 
- "once_cell", - "pin-project-lite", - "pin-utils", - "slab", - "wasm-bindgen-futures", -] - [[package]] name = "async-stream" version = "0.3.5" @@ -354,12 +225,6 @@ dependencies = [ "syn 2.0.28", ] -[[package]] -name = "async-task" -version = "4.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9441c6b2fe128a7c2bf680a44c34d0df31ce09e5b7e401fcca3faa483dbc921" - [[package]] name = "async-trait" version = "0.1.68" @@ -380,12 +245,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.1.0" @@ -415,7 +274,7 @@ dependencies = [ "http", "hyper", "ring", - "time 0.3.21", + "time", "tokio", "tower", "tracing", @@ -568,13 +427,13 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "hmac 0.12.1", + "hmac", "http", "once_cell", "percent-encoding", "regex", - "sha2 0.10.6", - "time 0.3.21", + "sha2", + "time", "tracing", ] @@ -606,8 +465,8 @@ dependencies = [ "http-body", "md-5", "pin-project-lite", - "sha1 0.10.5", - "sha2 0.10.6", + "sha1", + "sha2", "tracing", ] @@ -752,7 +611,7 @@ dependencies = [ "num-integer", "ryu", "serde", - "time 0.3.21", + "time", ] [[package]] @@ -776,7 +635,7 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version 0.4.0", + "rustc_version", "tracing", ] @@ -806,7 +665,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", - "sha1 0.10.5", + "sha1", "sync_wrapper", "tokio", "tokio-tungstenite", @@ -851,10 +710,10 @@ dependencies = [ "quick-xml", "rand 0.8.5", "reqwest", - "rustc_version 0.4.0", + "rustc_version", "serde", "serde_json", - "time 0.3.21", + "time", "url", "uuid", ] @@ -874,7 +733,7 @@ dependencies = [ "pin-project", "serde", "serde_json", - "time 0.3.21", + "time", "tz-rs", "url", "uuid", @@ -891,13 +750,13 @@ dependencies = [ "azure_core", "bytes", "futures", - "hmac 0.12.1", + "hmac", "log", "serde", "serde_derive", "serde_json", - "sha2 0.10.6", - "time 0.3.21", + "sha2", + "time", "url", "uuid", ] @@ -917,7 +776,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "time 0.3.21", + "time", "url", "uuid", ] @@ -937,12 +796,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base-x" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" - [[package]] name = "base64" version = "0.13.1" @@ -1015,15 +868,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -1033,22 +877,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "blocking" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c36a4d0d48574b3dd360b4b7d95cc651d2b6557b6402848a27d4b228a473e2a" -dependencies = [ - "async-channel", - "async-lock", - "async-task", - "fastrand 2.0.0", - "futures-io", - "futures-lite", - "piper", - "tracing", -] - [[package]] name = "bstr" version = "1.5.0" @@ -1193,15 
+1021,6 @@ dependencies = [ "half", ] -[[package]] -name = "cipher" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12f8e7987cbd042a63249497f41aed09f8e65add917ea6566effbc56578d6801" -dependencies = [ - "generic-array", -] - [[package]] name = "clang-sys" version = "1.6.1" @@ -1419,23 +1238,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "cookie" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a5d7b21829bc7b4bf4754a978a241ae54ea55a40f92bb20216e54096f4b951" -dependencies = [ - "aes-gcm", - "base64 0.13.1", - "hkdf", - "hmac 0.10.1", - "percent-encoding", - "rand 0.8.5", - "sha2 0.9.9", - "time 0.2.27", - "version_check", -] - [[package]] name = "core-foundation" version = "0.9.3" @@ -1461,19 +1263,13 @@ dependencies = [ "libc", ] -[[package]] -name = "cpuid-bool" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb25d077389e53838a8158c8e99174c5a9d902dee4904320db714f3c653ffba" - [[package]] name = "crc32c" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -1605,25 +1401,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "crypto-mac" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" -dependencies = [ - "generic-array", - "subtle", -] - -[[package]] -name = "ctr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a30d54f7443bf3d6191dcd486aca19e67cb3c49fa7a06a319966346707e7f" -dependencies = [ - "cipher", -] - [[package]] name = "darling" version = "0.20.1" @@ -1702,32 +1479,17 @@ dependencies = [ "rusticata-macros", ] -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array", -] - [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer 0.10.4", + "block-buffer", "crypto-common", "subtle", ] -[[package]] -name = "discard" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" - [[package]] name = "displaydoc" version = "0.2.4" @@ -2094,16 +1856,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "ghash" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97304e4cd182c3846f7575ced3890c53012ce534ad9114046b0a9e00bb30a375" -dependencies = [ - "opaque-debug", - "polyval", -] - [[package]] name = "gimli" version = "0.27.2" @@ -2138,18 +1890,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" -[[package]] -name = "gloo-timers" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" -dependencies = [ - "futures-channel", - "futures-core", - "js-sys", - "wasm-bindgen", -] - 
[[package]] name = "h2" version = "0.3.19" @@ -2221,7 +1961,7 @@ source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963b dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version", "spin 0.9.8", "stable_deref_trait", ] @@ -2263,33 +2003,13 @@ dependencies = [ "thiserror", ] -[[package]] -name = "hkdf" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51ab2f639c231793c5f6114bdb9bbe50a7dbbfcd7c7c6bd8475dec2d991e964f" -dependencies = [ - "digest 0.9.0", - "hmac 0.10.1", -] - -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac", - "digest 0.9.0", -] - [[package]] name = "hmac" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest 0.10.7", + "digest", ] [[package]] @@ -2333,9 +2053,7 @@ checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" dependencies = [ "anyhow", "async-channel", - "async-std", "base64 0.13.1", - "cookie", "futures-lite", "infer", "pin-project-lite", @@ -2649,15 +2367,6 @@ dependencies = [ "libc", ] -[[package]] -name = "kv-log-macro" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" -dependencies = [ - "log", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -2713,9 +2422,6 @@ name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -dependencies = [ - "value-bag", -] [[package]] name = "match_cfg" @@ -2744,7 +2450,7 @@ version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" dependencies = [ - "digest 0.10.7", + "digest", ] [[package]] @@ -2990,7 +2696,7 @@ dependencies = [ "serde", "serde_json", "serde_path_to_error", - "sha2 0.10.6", + "sha2", "thiserror", "url", ] @@ -3025,12 +2731,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - [[package]] name = "openssl" version = "0.10.55" @@ -3384,10 +3084,10 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" dependencies = [ - "digest 0.10.7", - "hmac 0.12.1", + "digest", + "hmac", "password-hash", - "sha2 0.10.6", + "sha2", ] [[package]] @@ -3481,17 +3181,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piper" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668d31b1c4eba19242f2088b2bf3316b82ca31082a8335764db4e083db7485d4" -dependencies = [ - "atomic-waker", - "fastrand 2.0.0", - "futures-io", -] - [[package]] name = "pkg-config" version = "0.3.27" @@ 
-3526,33 +3215,6 @@ dependencies = [ "plotters-backend", ] -[[package]] -name = "polling" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" -dependencies = [ - "autocfg", - "bitflags", - "cfg-if", - "concurrent-queue", - "libc", - "log", - "pin-project-lite", - "windows-sys 0.48.0", -] - -[[package]] -name = "polyval" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" -dependencies = [ - "cpuid-bool", - "opaque-debug", - "universal-hash", -] - [[package]] name = "postgres" version = "0.19.4" @@ -3586,12 +3248,12 @@ dependencies = [ "byteorder", "bytes", "fallible-iterator", - "hmac 0.12.1", + "hmac", "lazy_static", "md-5", "memchr", "rand 0.8.5", - "sha2 0.10.6", + "sha2", "stringprep", ] @@ -3820,7 +3482,7 @@ dependencies = [ "hashbrown 0.13.2", "hashlink", "hex", - "hmac 0.12.1", + "hmac", "hostname", "humantime", "hyper", @@ -3853,7 +3515,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sha2 0.10.6", + "sha2", "socket2 0.5.3", "sync_wrapper", "thiserror", @@ -3995,7 +3657,7 @@ checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" dependencies = [ "pem 2.0.1", "ring", - "time 0.3.21", + "time", "yasna", ] @@ -4252,7 +3914,7 @@ dependencies = [ "futures", "futures-timer", "rstest_macros", - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -4267,7 +3929,7 @@ dependencies = [ "quote", "regex", "relative-path", - "rustc_version 0.4.0", + "rustc_version", "syn 2.0.28", "unicode-ident", ] @@ -4284,22 +3946,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.17", + "semver", ] [[package]] @@ -4561,27 +4214,12 @@ dependencies = [ "libc", ] -[[package]] -name = "semver" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - [[package]] name = "semver" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "sentry" version = "0.31.6" @@ -4622,7 +4260,7 @@ dependencies = [ "hostname", "libc", "os_info", - "rustc_version 0.4.0", + "rustc_version", "sentry-core", "uname", ] @@ -4674,7 +4312,7 @@ dependencies = [ "serde", "serde_json", "thiserror", - "time 0.3.21", + "time", "url", "uuid", ] @@ -4775,7 +4413,7 @@ dependencies = [ "serde", "serde_json", "serde_with_macros", - "time 0.3.21", + "time", ] [[package]] @@ -4790,15 +4428,6 @@ dependencies = [ "syn 2.0.28", ] -[[package]] -name = 
"sha1" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da05c97445caa12d05e848c4a4fcbbea29e748ac28f7e80e9b010392063770" -dependencies = [ - "sha1_smol", -] - [[package]] name = "sha1" version = "0.10.5" @@ -4807,26 +4436,7 @@ checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.10.7", -] - -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - -[[package]] -name = "sha2" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" -dependencies = [ - "block-buffer 0.9.0", - "cfg-if", - "cpufeatures", - "digest 0.9.0", - "opaque-debug", + "digest", ] [[package]] @@ -4837,7 +4447,7 @@ checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.10.7", + "digest", ] [[package]] @@ -4894,7 +4504,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -4959,70 +4569,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" -[[package]] -name = "standback" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff" -dependencies = [ - "version_check", -] - [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "stdweb" -version = "0.4.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5" -dependencies = [ - "discard", - "rustc_version 0.2.3", - "stdweb-derive", - "stdweb-internal-macros", - "stdweb-internal-runtime", - "wasm-bindgen", -] - -[[package]] -name = "stdweb-derive" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef" -dependencies = [ - "proc-macro2", - "quote", - "serde", - "serde_derive", - "syn 1.0.109", -] - -[[package]] -name = "stdweb-internal-macros" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11" -dependencies = [ - "base-x", - "proc-macro2", - "quote", - "serde", - "serde_derive", - "serde_json", - "sha1 0.6.1", - "syn 1.0.109", -] - -[[package]] -name = "stdweb-internal-runtime" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" - [[package]] name = "storage_broker" version = "0.1.0" @@ -5256,21 +4808,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "time" -version = "0.2.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242" -dependencies = [ - "const_fn", - "libc", - "standback", - "stdweb", - "time-macros 0.1.1", - "version_check", - "winapi", -] - [[package]] name = "time" version 
= "0.3.21" @@ -5283,7 +4820,7 @@ dependencies = [ "num_threads", "serde", "time-core", - "time-macros 0.2.9", + "time-macros", ] [[package]] @@ -5292,16 +4829,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" -[[package]] -name = "time-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" -dependencies = [ - "proc-macro-hack", - "time-macros-impl", -] - [[package]] name = "time-macros" version = "0.2.9" @@ -5311,19 +4838,6 @@ dependencies = [ "time-core", ] -[[package]] -name = "time-macros-impl" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - "standback", - "syn 1.0.109", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -5685,7 +5199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" dependencies = [ "crossbeam-channel", - "time 0.3.21", + "time", "tracing-subscriber", ] @@ -5820,7 +5334,7 @@ dependencies = [ "httparse", "log", "rand 0.8.5", - "sha1 0.10.5", + "sha1", "thiserror", "url", "utf-8", @@ -5892,16 +5406,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "universal-hash" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8326b2c654932e3e4f9196e69d08fdf7cfd718e1dc6f66b347e6024a0c961402" -dependencies = [ - "generic-array", - "subtle", -] - [[package]] name = "untrusted" version = "0.7.1" @@ -6019,12 +5523,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "value-bag" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe" - [[package]] name = "vcpkg" version = "0.2.15" @@ -6514,11 +6012,10 @@ dependencies = [ "serde", "serde_json", "smallvec", - "standback", "syn 1.0.109", "syn 2.0.28", - "time 0.3.21", - "time-macros 0.2.9", + "time", + "time-macros", "tokio", "tokio-rustls", "tokio-util", @@ -6546,7 +6043,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -6570,7 +6067,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ - "time 0.3.21", + "time", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e528489f1e..363d4c6fe4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,7 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" -http-types = "2" +http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a088f1868b..3535c2cf94 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,7 +39,7 @@ hex = { version = "0.4", features = ["serde"] } hyper = { version = "0.14", features = 
["full"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -56,7 +56,6 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } smallvec = { version = "1", default-features = false, features = ["write"] } -standback = { version = "0.2", default-features = false, features = ["std"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -77,14 +76,13 @@ cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } -standback = { version = "0.2", default-features = false, features = ["std"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } From b7f45204a28f69e344e74d653b68c7011bc4a6da Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 Nov 2023 19:02:22 +0200 Subject: [PATCH 35/36] build: deny async-std and friends (#5849) rationale: some crates pull these in as default; hopefully these hints will require less cleanup-after and Cargo.lock file watching. follow-up to #5848. 
--- .github/workflows/build_and_test.yml | 2 +- deny.toml | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6068177245..ff7d8c1a62 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -172,7 +172,7 @@ jobs: # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources if: ${{ !cancelled() }} - run: cargo deny check + run: cargo deny check --hide-inclusion-graph build-neon: needs: [ check-permissions, tag ] diff --git a/deny.toml b/deny.toml index f4ea0d4dac..079dcac679 100644 --- a/deny.toml +++ b/deny.toml @@ -74,10 +74,30 @@ highlight = "all" workspace-default-features = "allow" external-default-features = "allow" allow = [] -deny = [] + skip = [] skip-tree = [] +[[bans.deny]] +# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol +# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust +name = "async-std" + +[[bans.deny]] +name = "async-io" + +[[bans.deny]] +name = "async-waker" + +[[bans.deny]] +name = "async-global-executor" + +[[bans.deny]] +name = "async-executor" + +[[bans.deny]] +name = "smol" + # This section is considered when running `cargo deny check sources`. # More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html From 74d150ba45cc230e2403cc7a016230e01334972f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 Nov 2023 21:10:54 +0200 Subject: [PATCH 36/36] build: upgrade ahash (#5851) `cargo deny` was complaining the version 0.8.3 was yanked (for possible DoS attack [wiki]), but the latest version (0.8.5) also includes aarch64 fixes which may or may not be relevant. Our usage of ahash limits to proxy, but I don't think we are at any risk. [wiki]: https://github.com/tkaitchuck/aHash/wiki/Yanked-versions --- Cargo.lock | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44ea46efda..4cada013d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,13 +25,14 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" dependencies = [ "cfg-if", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -6070,6 +6071,26 @@ dependencies = [ "time", ] +[[package]] +name = "zerocopy" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "zeroize" version = "1.6.0"