From 33cb9a68f7dde14a851ff5893f7107b4fc53856e Mon Sep 17 00:00:00 2001
From: John Khvatov
Date: Mon, 18 Dec 2023 16:33:23 +0300
Subject: [PATCH 01/57] pageserver: Reduce tracing overhead in timeline::get (#6115)

## Problem

The compaction process (specifically the image layer reconstruction part) lags behind WAL ingest (at an ingest rate of ~10-15 MB/s) for medium-sized tenants (30-50 GB). A CPU profile shows that a significant amount of time (see the flamegraph) is spent in `tracing::span::Span::new`.

mainline (commit: 0ba4cae491c26c7678f7abddb68bf76134a7df90):
![reconstruct-mainline-0ba4cae491c2](https://github.com/neondatabase/neon/assets/289788/ebfd262e-5c97-4858-80c7-664a1dbcc59d)

## Summary of changes

By lowering the tracing level of the `get_value_reconstruct_data` and `get_or_maybe_download` spans from info to debug, we can reduce the overhead of span creation in production environments. On my system, this sped up the image reconstruction process by 60% (from 14500 to 23160 page reconstructions per second).

PR:
![reconstruct-opt-2](https://github.com/neondatabase/neon/assets/289788/563a159b-8f2f-4300-b0a1-6cd66e7df769)

`create_image_layers()` (it is single-CPU bound here), mainline vs PR:
![image](https://github.com/neondatabase/neon/assets/289788/a981e3cb-6df9-4882-8a94-95e99c35aa83)
---
 pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++---------
 test_runner/regress/test_broken_timeline.py  |   4 +-
 2 files changed, 85 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index a4b102c314..9a8ddc1a6b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -259,8 +259,9 @@ impl Layer {
 
         layer
             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
             .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     }
 
     /// Download the layer if evicted.
@@ -654,7 +655,6 @@ impl LayerInner {
     }
 
     /// Cancellation safe.
- #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,95 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // FIXME: grab a gate - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. 
+ LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 53eeb8bbe9..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. 
Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" From 00d90ce76a230d7afc9994df9fafd688c76ebd57 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:04:47 +0100 Subject: [PATCH 02/57] Added cache for get role secret (#6165) ## Problem Currently if we are getting many consecutive connections to the same user/ep we will send a lot of traffic to the console. ## Summary of changes Cache with ttl=4min proxy_get_role_secret response. Note: this is the temporary hack, notifier listener is WIP. --- proxy/src/auth/backend.rs | 9 ++------- proxy/src/bin/proxy.rs | 16 +++++++++++++-- proxy/src/config.rs | 4 ++-- proxy/src/console/provider.rs | 13 +++++++++---- proxy/src/console/provider/mock.rs | 6 +++--- proxy/src/console/provider/neon.rs | 31 ++++++++++++++++++++++-------- proxy/src/scram/key.rs | 2 +- proxy/src/scram/secret.rs | 1 + proxy/src/serverless/conn_pool.rs | 1 - 9 files changed, 55 insertions(+), 28 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3b09e05bd2..0c867dfd61 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,7 +9,6 @@ use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::AuthInfo; use crate::console::AuthSecret; use crate::proxy::connect_compute::handle_try_wake; use crate::proxy::retry::retry_after; @@ -187,17 +186,13 @@ async fn auth_quirks( }; info!("fetching user's authentication info"); - // TODO(anna): this will slow down both "hacks" below; we probably need a cache. - let AuthInfo { - secret, - allowed_ips, - } = api.get_auth_info(extra, &info).await?; + let allowed_ips = api.get_allowed_ips(extra, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let secret = secret.unwrap_or_else(|| { + let secret = api.get_role_secret(extra, &info).await?.unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. 
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ae4c42bcb1..be3989d387 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -6,6 +6,7 @@ use proxy::config::HttpConfig; use proxy::console; use proxy::console::provider::AllowedIpsCache; use proxy::console::provider::NodeInfoCache; +use proxy::console::provider::RoleSecretCache; use proxy::http; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; @@ -86,7 +87,7 @@ struct ProxyCliArgs { #[clap(long)] metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] @@ -127,8 +128,11 @@ struct ProxyCliArgs { #[clap(flatten)] aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, /// disable ip check for http requests. If it is too time consuming, it could be turned off. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, @@ -266,9 +270,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; + let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); + info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { node_info: NodeInfoCache::new( "node_info_cache", @@ -282,6 +288,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allowed_ips_cache_config.ttl, false, ), + role_secret: RoleSecretCache::new( + "role_secret_cache", + role_secret_cache_config.size, + role_secret_cache_config.ttl, + false, + ), })); let config::WakeComputeLockOptions { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f932df4058..2ed248af8d 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -310,10 +310,10 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::console::provider::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. - /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
fn parse(options: &str) -> anyhow::Result { let mut size = None; let mut ttl = None; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8d399f26ea..7ef5e950b0 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -10,6 +10,7 @@ use crate::{ }; use async_trait::async_trait; use dashmap::DashMap; +use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; use tokio::{ sync::{OwnedSemaphorePermit, Semaphore}, @@ -216,6 +217,7 @@ impl ConsoleReqExtra { } /// Auth secret which is managed by the cloud. +#[derive(Clone)] pub enum AuthSecret { #[cfg(feature = "testing")] /// Md5 hash of user's password. @@ -250,18 +252,19 @@ pub struct NodeInfo { pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru, Arc>>; +pub type AllowedIpsCache = TimedLru>>; +pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. #[async_trait] pub trait Api { /// Get the client's auth secret for authentication. - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result; + ) -> Result, errors::GetAuthInfoError>; async fn get_allowed_ips( &self, @@ -282,7 +285,9 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: TimedLru, Arc>>, + pub allowed_ips: AllowedIpsCache, + /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. + pub role_secret: RoleSecretCache, } /// Various caches for [`console`](super). 
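
Both new caches are configured through the option string shown in `proxy/src/config.rs` above: `CACHE_DEFAULT_OPTIONS` is `"size=4000,ttl=4m"`, parsed by `CacheOptions::parse` (only the first lines of that function appear in this diff). The snippet below is a self-contained sketch of parsing that format, with hypothetical names (`parse_cache_options`, `parse_duration`) and the simplifying assumption that only the `size` and `ttl` keys with `s`/`m`/`h` duration suffixes occur; it is not the proxy's actual parser.

```rust
use std::time::Duration;

/// Parse a cache option string such as "size=4000,ttl=4m".
/// Illustrative sketch only; `CacheOptions::parse` in proxy/src/config.rs is
/// the real parser and may accept more than this.
fn parse_cache_options(options: &str) -> Result<(usize, Duration), String> {
    let (mut size, mut ttl) = (None, None);

    for pair in options.split(',') {
        let (key, value) = pair
            .split_once('=')
            .ok_or_else(|| format!("malformed option `{pair}`"))?;
        match key {
            "size" => size = Some(value.parse::<usize>().map_err(|e| e.to_string())?),
            "ttl" => ttl = Some(parse_duration(value)?),
            other => return Err(format!("unknown key `{other}`")),
        }
    }

    Ok((size.ok_or("missing `size`")?, ttl.ok_or("missing `ttl`")?))
}

/// Accept plain second ("240s"), minute ("4m") or hour ("1h") durations.
fn parse_duration(value: &str) -> Result<Duration, String> {
    if value.len() < 2 {
        return Err(format!("bad duration `{value}`"));
    }
    let (digits, unit) = value.split_at(value.len() - 1);
    let n: u64 = digits.parse().map_err(|e| format!("bad number in `{value}`: {e}"))?;
    let seconds = match unit {
        "s" => n,
        "m" => n * 60,
        "h" => n * 3600,
        _ => return Err(format!("unknown duration unit `{unit}`")),
    };
    Ok(Duration::from_secs(seconds))
}

fn main() {
    let (size, ttl) = parse_cache_options("size=4000,ttl=4m").unwrap();
    assert_eq!((size, ttl), (4000, Duration::from_secs(240)));
    println!("cache: {size} entries, ttl {ttl:?}");
}
```

As the CLI help text above notes, passing `size=0` is how the real proxy disables a cache entirely.
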
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index c464b4daf2..9c4a7447c6 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -142,12 +142,12 @@ async fn get_execute_postgres_query( #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(creds).await + ) -> Result, GetAuthInfoError> { + Ok(self.do_get_auth_info(creds).await?.secret) } async fn get_allowed_ips( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f748c9a41f..5bb91313c4 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -159,12 +159,24 @@ impl Api { #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(extra, creds).await + ) -> Result, GetAuthInfoError> { + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); + if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { + return Ok(role_secret.clone()); + } + let auth_info = self.do_get_auth_info(extra, creds).await?; + self.caches + .role_secret + .insert((ep.clone(), user), auth_info.secret.clone()); + self.caches + .allowed_ips + .insert(ep, Arc::new(auth_info.allowed_ips)); + Ok(auth_info.secret) } async fn get_allowed_ips( @@ -172,8 +184,7 @@ impl super::Api for Api { extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { - let key: &str = &creds.endpoint; - if let Some(allowed_ips) = self.caches.allowed_ips.get(key) { + if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -182,10 +193,14 @@ impl super::Api for Api { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) .inc(); - let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips); + let auth_info = self.do_get_auth_info(extra, creds).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); self.caches - .allowed_ips - .insert(key.into(), allowed_ips.clone()); + .role_secret + .insert((ep.clone(), user), auth_info.secret); + self.caches.allowed_ips.insert(ep, allowed_ips.clone()); Ok(allowed_ips) } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index e9c65fcef3..bd93fb2b70 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. -#[derive(Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 424beccec9..9e74e07af1 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,6 +5,7 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. +#[derive(Clone)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. 
pub iterations: u32, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index ab8903418b..df2d1bea32 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -431,7 +431,6 @@ async fn connect_to_compute( application_name: APP_NAME.to_string(), options: console_options, }; - // TODO(anna): this is a bit hacky way, consider using console notification listener. if !config.disable_ip_check_for_http { let allowed_ips = backend.get_allowed_ips(&extra).await?; if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) { From 4ea4812ab2a9909cae30562e2bf6e1dd02b79691 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Dec 2023 15:47:09 +0000 Subject: [PATCH 03/57] tests: update python dependencies (#6164) ## Problem Existing dependencies didn't work on Fedora 39 (python 3.12) ## Summary of changes - Update pyyaml 6.0 -> 6.0.1 - Update yarl 1.8.2->1.9.4 - Update the `dnf install` line in README to include dependencies of python packages (unrelated to upgrades, just noticed absences while doing fresh pysync run) --- README.md | 5 +- poetry.lock | 258 +++++++++++++++++++++++++++++----------------------- 2 files changed, 145 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 3e3123f5ee..98af1edee6 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python-poetry lsof libicu-dev +libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel openssl poetry lsof libicu-devel + protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \ + libffi-devel ``` * On Arch based systems, these packages are needed: ```bash diff --git a/poetry.lock b/poetry.lock index 8583a71f85..76dfd6d37d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2092,51 +2092,61 @@ files = [ [[package]] name = "pyyaml" -version = "6.0" +version = "6.0.1" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.6" files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = 
"PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] [[package]] @@ -2553,85 +2563,101 @@ files = [ [[package]] name = "yarl" -version = "1.8.2" +version = "1.9.4" description = "Yet another URL library" optional = false python-versions = ">=3.7" files = [ - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, - {file = 
"yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, - {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, - {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, - {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, - {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, - {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, - 
{file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, - {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, - {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, - {file = 
"yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, - {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, - {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, - {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, - {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, - {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = 
"yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", 
hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash 
= "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, ] [package.dependencies] From 1f9a7d1cd0a94a7c539c4fc9ff194d4fdf2917c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Dec 2023 19:17:19 +0100 Subject: [PATCH 04/57] add a Rust client for Pageserver page_service (#6128) Part of getpage@lsn benchmark epic: https://github.com/neondatabase/neon/issues/5771 Stacked atop https://github.com/neondatabase/neon/pull/6145 --- Cargo.lock | 9 ++ libs/pageserver_api/src/models.rs | 93 +++++++++++++++- pageserver/Cargo.toml | 1 + pageserver/client/Cargo.toml | 8 ++ pageserver/client/src/lib.rs | 1 + pageserver/client/src/page_service.rs | 151 ++++++++++++++++++++++++++ 6 files changed, 257 insertions(+), 6 deletions(-) create mode 100644 pageserver/client/src/page_service.rs diff --git a/Cargo.lock b/Cargo.lock index f931fd6c29..9a367effbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3145,6 +3145,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", @@ -3182,11 +3183,19 @@ dependencies = [ name = "pageserver_client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", + "bytes", + "futures", "pageserver_api", + "postgres", "reqwest", "serde", "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", "utils", "workspace_hack", ] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a78ba8ad94..0f5e202249 100644 
--- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,7 @@ pub mod partitioning; use std::{ collections::HashMap, + io::Read, num::{NonZeroU64, NonZeroUsize}, time::SystemTime, }; @@ -19,7 +20,7 @@ use utils::{ use crate::{reltag::RelTag, shard::TenantShardId}; use anyhow::bail; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. /// @@ -576,6 +577,7 @@ pub enum PagestreamFeMessage { } // Wrapped in libpq CopyData +#[derive(strum_macros::EnumProperty)] pub enum PagestreamBeMessage { Exists(PagestreamExistsResponse), Nblocks(PagestreamNblocksResponse), @@ -584,6 +586,29 @@ pub enum PagestreamBeMessage { DbSize(PagestreamDbSizeResponse), } +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamBeMessageTag { + Exists = 100, + Nblocks = 101, + GetPage = 102, + Error = 103, + DbSize = 104, +} +impl TryFrom for PagestreamBeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 100 => Ok(PagestreamBeMessageTag::Exists), + 101 => Ok(PagestreamBeMessageTag::Nblocks), + 102 => Ok(PagestreamBeMessageTag::GetPage), + 103 => Ok(PagestreamBeMessageTag::Error), + 104 => Ok(PagestreamBeMessageTag::DbSize), + _ => Err(value), + } + } +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { pub latest: bool, @@ -739,35 +764,91 @@ impl PagestreamBeMessage { pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); + use PagestreamBeMessageTag as Tag; match self { Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Exists as u8); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Nblocks as u8); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::GetPage as u8); bytes.put(&resp.page[..]); } Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Error as u8); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } } bytes.into() } + + pub fn deserialize(buf: Bytes) -> anyhow::Result { + let mut buf = buf.reader(); + let msg_tag = buf.read_u8()?; + + use PagestreamBeMessageTag as Tag; + let ok = + match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
{ + Tag::Exists => { + let exists = buf.read_u8()?; + Self::Exists(PagestreamExistsResponse { + exists: exists != 0, + }) + } + Tag::Nblocks => { + let n_blocks = buf.read_u32::()?; + Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + } + Tag::GetPage => { + let mut page = vec![0; 8192]; // TODO: use MaybeUninit + buf.read_exact(&mut page)?; + PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + } + Tag::Error => { + let buf = buf.get_ref(); + let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?; + let rust_str = cstr.to_str()?; + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: rust_str.to_owned(), + }) + } + Tag::DbSize => { + let db_size = buf.read_i64::()?; + Self::DbSize(PagestreamDbSizeResponse { db_size }) + } + }; + let remaining = buf.into_inner(); + if !remaining.is_empty() { + anyhow::bail!( + "remaining bytes in msg with tag={msg_tag}: {}", + remaining.len() + ); + } + Ok(ok) + } + + pub fn kind(&self) -> &'static str { + match self { + Self::Exists(_) => "Exists", + Self::Nblocks(_) => "Nblocks", + Self::GetPage(_) => "GetPage", + Self::Error(_) => "Error", + Self::DbSize(_) => "DbSize", + } + } } #[cfg(test)] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9e8172c6a1..980fbab22e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -63,6 +63,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 4bd36185a6..0ed27602cd 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -12,3 +12,11 @@ reqwest.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 3963fd466c..4a3f4dea47 100644 --- a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1 +1,2 @@ pub mod mgmt_api; +pub mod page_service; diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs new file mode 100644 index 0000000000..fc0d2311f7 --- /dev/null +++ b/pageserver/client/src/page_service.rs @@ -0,0 +1,151 @@ +use std::pin::Pin; + +use futures::SinkExt; +use pageserver_api::{ + models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, + PagestreamGetPageResponse, + }, + reltag::RelTag, +}; +use tokio::task::JoinHandle; +use tokio_postgres::CopyOutStream; +use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +pub struct Client { + client: tokio_postgres::Client, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct BasebackupRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub lsn: Option, + pub gzip: bool, +} + +impl Client { + pub async fn new(connstring: String) -> anyhow::Result { + let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + + let conn_task_cancel = 
CancellationToken::new(); + let conn_task = tokio::spawn({ + let conn_task_cancel = conn_task_cancel.clone(); + async move { + tokio::select! { + _ = conn_task_cancel.cancelled() => { } + res = connection => { + res.unwrap(); + } + } + } + }); + Ok(Self { + cancel_on_client_drop: Some(conn_task_cancel.drop_guard()), + conn_task, + client, + }) + } + + pub async fn pagestream( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + let copy_both: tokio_postgres::CopyBothDuplex = self + .client + .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .await?; + let Client { + cancel_on_client_drop, + conn_task, + client: _, + } = self; + Ok(PagestreamClient { + copy_both: Box::pin(copy_both), + conn_task, + cancel_on_client_drop, + }) + } + + pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result { + let BasebackupRequest { + tenant_id, + timeline_id, + lsn, + gzip, + } = req; + let mut args = Vec::with_capacity(5); + args.push("basebackup".to_string()); + args.push(format!("{tenant_id}")); + args.push(format!("{timeline_id}")); + if let Some(lsn) = lsn { + args.push(format!("{lsn}")); + } + if *gzip { + args.push("--gzip".to_string()) + } + Ok(self.client.copy_out(&args.join(" ")).await?) + } +} + +/// Create using [`Client::pagestream`]. +pub struct PagestreamClient { + copy_both: Pin>>, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct RelTagBlockNo { + pub rel_tag: RelTag, + pub block_no: u32, +} + +impl PagestreamClient { + pub async fn shutdown(mut self) { + let _ = self.cancel_on_client_drop.take(); + self.conn_task.await.unwrap(); + } + + pub async fn getpage( + &mut self, + key: RelTagBlockNo, + lsn: Lsn, + ) -> anyhow::Result { + let req = PagestreamGetPageRequest { + latest: false, + rel: key.rel_tag, + blkno: key.block_no, + lsn, + }; + let req = PagestreamFeMessage::GetPage(req); + let req: bytes::Bytes = req.serialize(); + // let mut req = tokio_util::io::ReaderStream::new(&req); + let mut req = tokio_stream::once(Ok(req)); + + self.copy_both.send_all(&mut req).await?; + + let next: Option> = self.copy_both.next().await; + let next: bytes::Bytes = next.unwrap()?; + + let msg = PagestreamBeMessage::deserialize(next)?; + match msg { + PagestreamBeMessage::GetPage(p) => Ok(p), + PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), + PagestreamBeMessage::Exists(_) + | PagestreamBeMessage::Nblocks(_) + | PagestreamBeMessage::DbSize(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + msg.kind() + ) + } + } + } +} From 62737f37767db150383397b99a52e16217e40e4a Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 18 Dec 2023 10:05:39 -0800 Subject: [PATCH 05/57] Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles --- compute_tools/src/spec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ba1ee6d1b2..20299c8fde 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited // from neon_superuser. 
let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); From 82215d20b01c82eb8dd3aebef724854008cdcdb3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 18 Dec 2023 21:05:24 +0200 Subject: [PATCH 06/57] Mark some variables 'static' Move initialization of neon_redo_read_buffer_filter. This allows marking it 'static', too. --- pgxn/neon/libpagestore.c | 9 ++------- pgxn/neon/neon.h | 7 ------- pgxn/neon/pagestore_smgr.c | 15 +++++++++++---- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 16406ce8a3..5056a3c5ff 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -17,7 +17,6 @@ #include "pagestore_client.h" #include "fmgr.h" #include "access/xlog.h" -#include "access/xlogutils.h" #include "storage/buf_internals.h" #include "storage/lwlock.h" #include "storage/ipc.h" @@ -62,8 +61,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int n_reconnect_attempts = 0; -int max_reconnect_attempts = 60; +static int n_reconnect_attempts = 0; +static int max_reconnect_attempts = 60; #define MAX_PAGESERVER_CONNSTRING_SIZE 256 @@ -83,8 +82,6 @@ static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - static bool pageserver_flush(void); static void pageserver_disconnect(void); @@ -627,8 +624,6 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; - old_redo_read_buffer_filter = redo_read_buffer_filter; - redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 897a8373a1..c3afecc679 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -27,13 +27,6 @@ extern void pg_init_walproposer(void); extern void pg_init_extension_server(void); -/* - * Returns true if we shouldn't do REDO on that block in record indicated by - * block_id; false otherwise. 
- */ -extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); -extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 609d80588c..99e6583ab2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -47,9 +47,10 @@ #include "access/xact.h" #include "access/xlog.h" +#include "access/xlogdefs.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" -#include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" @@ -106,6 +107,9 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + /* * Prefetch implementation: * @@ -239,7 +243,7 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -PrefetchState *MyPState; +static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ ( \ @@ -257,7 +261,7 @@ PrefetchState *MyPState; ) \ ) -XLogRecPtr prefetch_lsn = 0; +static XLogRecPtr prefetch_lsn = 0; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); @@ -1371,6 +1375,9 @@ neon_init(void) MyPState->prf_hash = prfh_create(MyPState->hashctx, readahead_buffer_size, NULL); + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; + #ifdef DEBUG_COMPARE_LOCAL mdinit(); #endif @@ -2869,7 +2876,7 @@ get_fsm_physical_block(BlockNumber heapblk) * contents, where with REDO locking it would wait on block 1 and see * block 3 with post-REDO contents only. */ -bool +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; From c4c48cfd6344ece6ef9669db388ae17de3d0972e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 18 Dec 2023 21:05:29 +0200 Subject: [PATCH 07/57] Clean up #includes - No need to include c.h, port.h or pg_config.h, they are included in postgres.h - No need to include postgres.h in header files. Instead, the assumption in PostgreSQL is that all .c files include postgres.h. 
- Reorder includes to alphabetical order, and system headers before pgsql headers - Remove bunch of other unnecessary includes that got copy-pasted from one source file to another --- libs/walproposer/bindgen_deps.h | 1 + pgxn/neon/control_plane_connector.c | 19 ++++++++++--------- pgxn/neon/extension_server.c | 15 ++------------- pgxn/neon/file_cache.c | 16 +++++++--------- pgxn/neon/libpagestore.c | 21 +++++++++------------ pgxn/neon/neon_utils.c | 27 +-------------------------- pgxn/neon/neon_utils.h | 2 -- pgxn/neon/pagestore_client.h | 7 ++----- pgxn/neon/pagestore_smgr.c | 10 +++++----- pgxn/neon/walproposer.h | 8 +++----- pgxn/neon/walproposer_compat.c | 6 ++++-- 11 files changed, 44 insertions(+), 88 deletions(-) diff --git a/libs/walproposer/bindgen_deps.h b/libs/walproposer/bindgen_deps.h index b95788347c..41ee1cd4a3 100644 --- a/libs/walproposer/bindgen_deps.h +++ b/libs/walproposer/bindgen_deps.h @@ -1 +1,2 @@ +#include "postgres.h" #include "walproposer.h" diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 2e7da671f9..e467a9c43a 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -19,20 +19,21 @@ *------------------------------------------------------------------------- */ #include "postgres.h" + +#include + +#include "access/xact.h" +#include "commands/defrem.h" +#include "fmgr.h" +#include "libpq/crypt.h" +#include "miscadmin.h" #include "tcop/pquery.h" #include "tcop/utility.h" -#include "access/xact.h" +#include "utils/acl.h" +#include "utils/guc.h" #include "utils/hsearch.h" #include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include #include "utils/jsonb.h" -#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index fbbb8fd448..d9a75142f1 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -1,4 +1,3 @@ - /*------------------------------------------------------------------------- * * extension_server.c @@ -10,21 +9,11 @@ *------------------------------------------------------------------------- */ #include "postgres.h" -#include "tcop/pquery.h" -#include "tcop/utility.h" -#include "access/xact.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include "fmgr.h" #include +#include "utils/guc.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 54b3661e66..53258f4e49 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -13,32 +13,30 @@ *------------------------------------------------------------------------- */ +#include "postgres.h" + #include #include #include -#include "postgres.h" - #include "neon_pgversioncompat.h" +#include "access/parallel.h" #include "funcapi.h" #include "miscadmin.h" -#include "pgstat.h" #include "pagestore_client.h" -#include "access/parallel.h" +#include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" -#include "storage/latch.h" +#include "storage/fd.h" #include "storage/ipc.h" +#include "storage/latch.h" #include "storage/lwlock.h" +#include 
"storage/pg_shmem.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" -#include "storage/fd.h" -#include "storage/pg_shmem.h" -#include "storage/buf_internals.h" -#include "pgstat.h" /* * Local file cache is used to temporary store relations pages in local file system. diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5056a3c5ff..3b038f906f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,27 +14,24 @@ */ #include "postgres.h" -#include "pagestore_client.h" -#include "fmgr.h" #include "access/xlog.h" -#include "storage/buf_internals.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/pg_shmem.h" -#include "c.h" -#include "postmaster/interrupt.h" - +#include "fmgr.h" #include "libpq-fe.h" -#include "libpq/pqformat.h" #include "libpq/libpq.h" - +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/guc.h" #include "neon.h" -#include "walproposer.h" #include "neon_utils.h" +#include "pagestore_client.h" +#include "walproposer.h" #define PageStoreTrace DEBUG5 diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 807d2decf6..9135847aaf 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -3,33 +3,8 @@ #include "postgres.h" -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" +#include "lib/stringinfo.h" #include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "libpq-fe.h" -#include -#include - -#if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#endif -#if PG_MAJORVERSION_NUM >= 16 -#include "utils/guc.h" -#endif /* * Convert a character which represents a hexadecimal digit to an integer. 
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 20745d8b26..a86f1e061c 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,8 +1,6 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ -#include "postgres.h" - bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index ecfadb01d6..225959ef64 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -13,19 +13,16 @@ #ifndef pageserver_h #define pageserver_h -#include "postgres.h" #include "neon_pgversioncompat.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR -#include "storage/block.h" -#include "storage/smgr.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" +#include "storage/block.h" +#include "storage/smgr.h" #include "utils/memutils.h" -#include "pg_config.h" - typedef enum { /* pagestore_client -> pagestore */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 99e6583ab2..8888cd89c6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -54,19 +54,19 @@ #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" -#include "pagestore_client.h" -#include "postmaster/interrupt.h" +#include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" #include "storage/fsm_internals.h" -#include "storage/smgr.h" #include "storage/md.h" -#include "pgstat.h" +#include "storage/smgr.h" + +#include "pagestore_client.h" #if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 615018c58e..6ba2aae75b 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -1,14 +1,12 @@ #ifndef __NEON_WALPROPOSER_H__ #define __NEON_WALPROPOSER_H__ -#include "postgres.h" -#include "access/xlogdefs.h" -#include "port.h" -#include "access/xlog_internal.h" #include "access/transam.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" #include "nodes/replnodes.h" -#include "utils/uuid.h" #include "replication/walreceiver.h" +#include "utils/uuid.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 04b519ab15..35d984c52e 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -3,11 +3,13 @@ * This is needed to avoid linking to full postgres server installation. This file * is compiled as a part of libwalproposer static library. */ +#include "postgres.h" #include -#include "walproposer.h" -#include "utils/datetime.h" + #include "miscadmin.h" +#include "utils/datetime.h" +#include "walproposer.h" void ExceptionalCondition(const char *conditionName, From 6939fc3db6d18569ae2d11fca4abdef1689841d4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 18 Dec 2023 21:05:31 +0200 Subject: [PATCH 08/57] Remove declarations of non-existent global variables and functions FileCacheMonitorMain was removed in commit b497d0094e. 
--- pgxn/neon/file_cache.c | 2 -- pgxn/neon/pagestore_client.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 53258f4e49..6725ce8fff 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -100,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) -void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg); - /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 225959ef64..3fcaab0bee 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -155,11 +155,8 @@ extern page_server_api *page_server; extern char *page_server_connstring; extern int flush_every_n_requests; extern int readahead_buffer_size; -extern bool seqscan_prefetch_enabled; -extern int seqscan_prefetch_distance; extern char *neon_timeline; extern char *neon_tenant; -extern bool wal_redo; extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); From 6e6e40dd7fc81f45166a008e9f8f957f1345a420 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 18 Dec 2023 23:24:22 +0100 Subject: [PATCH 09/57] Invalidate credentials on auth failure (#6171) ## Problem If the user reset password, cache could receive this information only after `ttl` minutes. ## Summary of changes Invalidate password on auth failure. --- proxy/src/auth.rs | 4 ++++ proxy/src/auth/backend.rs | 34 +++++++++++++++++++++++++++++- proxy/src/console/provider.rs | 3 ++- proxy/src/console/provider/mock.rs | 7 ++++-- proxy/src/console/provider/neon.rs | 12 ++++++----- 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index eadb9abd43..64ef108e11 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -87,6 +87,10 @@ impl AuthError { pub fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } + + pub fn is_auth_failed(&self) -> bool { + matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + } } impl> From for AuthError { diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 0c867dfd61..923bd02560 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -192,14 +192,46 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let secret = api.get_role_secret(extra, &info).await?.unwrap_or_else(|| { + let cached_secret = api.get_role_secret(extra, &info).await?; + + let secret = cached_secret.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. info!("authentication info not found, mocking it"); AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) }); + match authenticate_with_secret( + secret, + info, + client, + unauthenticated_password, + allow_cleartext, + config, + latency_timer, + ) + .await + { + Ok(keys) => Ok(keys), + Err(e) => { + if e.is_auth_failed() { + // The password could have been changed, so we invalidate the cache. 
+ cached_secret.invalidate(); + } + Err(e) + } + } +} +async fn authenticate_with_secret( + secret: AuthSecret, + info: ComputeUserInfo, + client: &mut stream::PqStream>, + unauthenticated_password: Option>, + allow_cleartext: bool, + config: &'static AuthenticationConfig, + latency_timer: &mut LatencyTimer, +) -> auth::Result> { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7ef5e950b0..e4cf1e8c8e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -254,6 +254,7 @@ pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; pub type AllowedIpsCache = TimedLru>>; pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; +pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -264,7 +265,7 @@ pub trait Api { &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result, errors::GetAuthInfoError>; + ) -> Result; async fn get_allowed_ips( &self, diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 9c4a7447c6..dba5e5863f 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -6,6 +6,7 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; +use crate::console::provider::CachedRoleSecret; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; @@ -146,8 +147,10 @@ impl super::Api for Api { &self, _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result, GetAuthInfoError> { - Ok(self.do_get_auth_info(creds).await?.secret) + ) -> Result { + Ok(CachedRoleSecret::new_uncached( + self.do_get_auth_info(creds).await?.secret, + )) } async fn get_allowed_ips( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 5bb91313c4..628d98df49 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,7 +3,8 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + NodeInfo, }; use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; @@ -163,20 +164,21 @@ impl super::Api for Api { &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result, GetAuthInfoError> { + ) -> Result { let ep = creds.endpoint.clone(); let user = creds.inner.user.clone(); if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { - return Ok(role_secret.clone()); + return Ok(role_secret); } let auth_info = self.do_get_auth_info(extra, creds).await?; - self.caches + let (_, secret) = self + .caches .role_secret .insert((ep.clone(), user), auth_info.secret.clone()); self.caches .allowed_ips .insert(ep, Arc::new(auth_info.allowed_ips)); - Ok(auth_info.secret) + Ok(secret) } async fn get_allowed_ips( From 
c272c68e5c0715a55441c1db8235ea57662a8cc0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 11:20:56 +0100 Subject: [PATCH 10/57] RFC: Per-Tenant GetPage@LSN Throttling (#5648) Implementation epic: https://github.com/neondatabase/neon/issues/5899 --- docs/rfcs/029-getpage-throttling.md | 197 ++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 docs/rfcs/029-getpage-throttling.md diff --git a/docs/rfcs/029-getpage-throttling.md b/docs/rfcs/029-getpage-throttling.md new file mode 100644 index 0000000000..b4f9adefc5 --- /dev/null +++ b/docs/rfcs/029-getpage-throttling.md @@ -0,0 +1,197 @@ +# Per-Tenant GetPage@LSN Throttling + +Author: Christian Schwarz +Date: Oct 24, 2023 + +## Summary + +This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver +and the interactions with its client, i.e., the neon_smgr component in Compute. + +The result of implementing & executing this RFC will be a fleet-wide upper limit for +**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**. + +## Background + +### GetPage@LSN Request Flow + +Pageserver exposes its `page_service.rs` as a libpq listener. +The Computes' `neon_smgr` module connects to that libpq listener. +Once a connection is established, the protocol allows Compute to request page images at a given LSN. +We call these requests GetPage@LSN requests, or GetPage requests for short. +Other request types can be sent, but these are low traffic compared to GetPage requests +and are not the concern of this RFC. + +Pageserver associates one libpq connection with one tokio task. + +Per connection/task, the pq protocol is handled by the common `postgres_backend` crate. +Its `run_message_loop` function invokes the `page_service` specific `impl postgres_backend::Handler for PageServerHandler`. +Requests are processed in the order in which they arrive via the TCP-based pq protocol. +So, there is no concurrent request processing within one connection/task. + +There is a degree of natural pipelining: +Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream. +And Pageserver can fill the pipe with responses in the other direction. +Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc. + +### GetPage@LSN Access Pattern + +The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC). +Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches. + +If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*. + +## Motivation + +In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h, +then dropping to ca 18k GetPage/second for a period of 9h. + +We noticed this because of an internal GetPage latency SLO burn rate alert, i.e., +the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO. + +Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same tenants. + +However, here are some illustrative data points for the 155k period: +The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance. 
+We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`) +The CPU utilization of the instance was 75% user+system. +Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%. +The hit rate for materialized pages was ca. 40%. +Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100. + +The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**. +The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM. +The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**. + +My claim is that it was **unhealthy to serve this workload at the pace we did**: +* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this) +* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons: + * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops. + At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit. + The result is an **uneven** performance profile from the Compute perspective. + + * **economics**: Neon currently does not charge for IOPS, only capacity. + **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.** + For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume. + Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume. + We charge 0$. + It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free. + + +Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits: +vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver. +So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity. + +## Solution: Throttling GetPage Requests + +**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**. +That is, unless we want to start charging for provisioned GetPage@LSN/second. +Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size. +Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913). + +## The Design Space + +What that remains is the question about *policy* and *mechanism*: + +**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant. +Candidates are: + +* hard limit, same limit value per connection|timeline|tenant + * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance. + This is a major operational pain point / risk right now. +* hard limit, configurable per connection|timeline|tenant + * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers. + * Note that this is not a mechanism to guarantee a minium provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant. 
+* fair share among active connections|timelines|tenants per instance + * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity + * NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance +* ... + + +Regarding **mechanism**, it's clear that **backpressure** is the way to go. +However, we must choose between +* **implicit** backpressure through pq/TCP and +* **explicit** rejection of requests + retries with exponential backoff + +Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**: +where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling? + +And when we eventually move the measurement point into the Computes (to avoid coordinated omission), +how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO? + +## Scope Of This RFC + +**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**. + +This proposal is easy to implement and significantly de-risks operating large Pageservers, +based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants. + +For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate. +Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate. +If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation. + +The mechanism for backpressure will be TCP-based implicit backpressure. +The compute team isn't concerned about prefetch queue depth. +Pageserver will implement it by delaying the reading of requests from the libpq connection(s). + +The rate limit will be implemented using a per-tenant token bucket. +The bucket will be be shared among all connections to the tenant. +The bucket implementation supports starvation-preventing `await`ing. +The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/). +The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771 +can be used to evaluate the overhead of sharing the bucket among connections of a tenant. +A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler. + +Regarding metrics / the internal GetPage latency SLO: +we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by: +- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver) +- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver) + +Further observability measures: +- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute. + The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats. 
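To make the shared per-tenant token bucket described above concrete, here is a minimal sketch, assuming the `leaky-bucket` crate's builder API (`RateLimiter::builder()`, `acquire_one()`) and tokio. The `TenantThrottle` and `handle_connection` names, the refill parameters, and the way the limiter is shared across connection handlers are illustrative assumptions for this sketch, not the pageserver's actual implementation.

```rust
// Illustrative sketch only; assumes Cargo.toml dependencies
// leaky-bucket = "1" and tokio = { version = "1", features = ["full"] }.
use std::sync::Arc;
use std::time::Duration;

use leaky_bucket::RateLimiter;

/// Hypothetical per-tenant state: one token bucket shared by all
/// page_service connections of that tenant.
struct TenantThrottle {
    limiter: RateLimiter,
}

impl TenantThrottle {
    fn new(getpage_per_second: usize) -> Self {
        // Refill in small steps to smooth the rate; `fair(true)` prevents
        // starvation when many connections wait on the same bucket.
        let refill = (getpage_per_second / 10).max(1);
        Self {
            limiter: RateLimiter::builder()
                .max(getpage_per_second) // burst capacity: ~1s worth of tokens
                .initial(refill)
                .refill(refill)
                .interval(Duration::from_millis(100))
                .fair(true)
                .build(),
        }
    }
}

/// Hypothetical stand-in for one page_service connection handler: it only
/// pulls the next request off the libpq connection once a token is available.
async fn handle_connection(conn_id: usize, throttle: Arc<TenantThrottle>) {
    for req_no in 0..5 {
        throttle.limiter.acquire_one().await; // wait for a token
        // ... the real handler would read and serve one GetPage request here ...
        println!("conn {conn_id}: served request {req_no}");
    }
}

#[tokio::main]
async fn main() {
    // Example: a 10k GetPage/s limit shared by all connections of one tenant.
    let throttle = Arc::new(TenantThrottle::new(10_000));
    let mut tasks = Vec::new();
    for conn_id in 0..3 {
        tasks.push(tokio::spawn(handle_connection(conn_id, Arc::clone(&throttle))));
    }
    for t in tasks {
        t.await.unwrap();
    }
}
```

Acquiring the token before reading the next request off the libpq socket is what lets the throttle surface as implicit TCP backpressure toward the compute, matching the backpressure mechanism chosen above.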
+ +Rollout will happen as follows: +- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf +- experimentation in staging and later production to study impact & interaction with auto-scaling +- determination of a sensible global default value + - the value will be chosen as high as possible ... + - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance. +- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default +- reset of the experimental per-tenant overrides +- gain experience & lower the limit over time + - we stop lowering the limit as soon as this RFC's goal is achieved, i.e., + once we decide that in practice the chosen value sufficiently de-risks operating large pageservers + +The per-tenant override will remain for emergencies and testing. +But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant. + +Toward the upper layers of the Neon stack, the resulting limit will be +**"the highest GetPage/second that Pageserver can support for a single tenant"**. + +### Rationale + +We decided against error + retry because of worries about starvation. + +## Future Work + +Enable per-tenant emergency override of the limit via Console. +Should be part of a more general framework to specify tenant config overrides. +**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users, +or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that +concerns itself with GetPage/second capacity planning. + +Compute-side metrics for GetPage latency. + +Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled. + +Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss. + +Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant. +Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance. +With per-tenant rate limiting, we will not meet that expectation. +However, we can currently only scale per tenant. +Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis. +But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit. +To solve this properly, I think we'll need replicas for tenants / shard. +To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas. From a89d6dc76e8406ad15e45b190bc687b8b208c3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 19 Dec 2023 11:29:16 +0100 Subject: [PATCH 11/57] Always send a json response for timeline_get_lsn_by_timestamp (#6178) As part of the transition laid out in [this](https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911) comment, don't read the `version` query parameter in `timeline_get_lsn_by_timestamp`, but always return the structured json response. 
Follow-up of https://github.com/neondatabase/neon/pull/5608 --- pageserver/src/http/routes.rs | 37 ++++--------- test_runner/fixtures/pageserver/http.py | 12 ++++- test_runner/regress/test_lsn_mapping.py | 71 ++----------------------- 3 files changed, 24 insertions(+), 96 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 601fad5bde..bc8b677f77 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -592,8 +592,6 @@ async fn get_lsn_by_timestamp_handler( ))); } - let version: Option = parse_query_param(&request, "version")?; - let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) @@ -606,31 +604,18 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - - if version.unwrap_or(0) > 1 { - #[derive(serde::Serialize)] - struct Result { - lsn: Lsn, - kind: &'static str, - } - let (lsn, kind) = match result { - LsnForTimestamp::Present(lsn) => (lsn, "present"), - LsnForTimestamp::Future(lsn) => (lsn, "future"), - LsnForTimestamp::Past(lsn) => (lsn, "past"), - LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), - }; - json_response(StatusCode::OK, Result { lsn, kind }) - } else { - // FIXME: this is a temporary crutch not to break backwards compatibility - // See https://github.com/neondatabase/neon/pull/5608 - let result = match result { - LsnForTimestamp::Present(lsn) => format!("{lsn}"), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - json_response(StatusCode::OK, result) + #[derive(serde::Serialize)] + struct Result { + lsn: Lsn, + kind: &'static str, } + let (lsn, kind) = match result { + LsnForTimestamp::Present(lsn) => (lsn, "present"), + LsnForTimestamp::Future(lsn) => (lsn, "future"), + LsnForTimestamp::Past(lsn) => (lsn, "past"), + LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), + }; + json_response(StatusCode::OK, Result { lsn, kind }) } async fn get_timestamp_of_lsn_handler( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eda8813c36..add6c4288a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -510,13 +510,21 @@ class PageserverHttpClient(requests.Session): assert res_json is None def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int + self, + tenant_id: TenantId, + timeline_id: TimelineId, + timestamp, + version: Optional[int] = None, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) + if version is None: + version_str = "" + else: + version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f79c1c347c..65d6d7a9fd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -8,71 +8,6 @@ from fixtures.types import Lsn from fixtures.utils 
import query_scalar -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") - - cur = endpoint_main.connect().cursor() - # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. - # - # Each row contains current insert LSN and the current timestamp, when - # the row was inserted. - cur.execute("SET synchronous_commit=off") - cur.execute("CREATE TABLE foo (x integer)") - tbl = [] - for i in range(1000): - cur.execute("INSERT INTO foo VALUES(%s)", (i,)) - # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) - tbl.append([i, after_timestamp]) - - # Execute one more transaction with synchronous_commit enabled, to flush - # all the previous transactions - cur.execute("SET synchronous_commit=on") - cur.execute("INSERT INTO foo VALUES (-1)") - - # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) - - with env.pageserver.http_client() as client: - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - lsn = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - # Call get_lsn_by_timestamp to get the LSN - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. 
- endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn - ) - assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i - - endpoint_here.stop_and_destroy() - - # # Test pageserver get_lsn_by_timestamp API # @@ -130,7 +65,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "future" # make sure that we return a well advanced lsn here @@ -139,7 +74,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range @@ -149,7 +84,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] From fbb979d5e34d1a2aed6578faba72d3b6cad60366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 19 Dec 2023 11:29:50 +0100 Subject: [PATCH 12/57] remote_storage: move shared utilities for S3 and Azure into common module (#6176) The PR does two things: * move the util functions present in the remote_storage Azure and S3 test files into a shared one, deduplicating them. * add a `s3_upload_download_works` test as a copy of the Azure test The goal is mainly to fight duplication and make the code a little bit more generic (like removing mentions of s3 and azure from function names). This is a first step towards #6146. 
--- libs/remote_storage/tests/common/mod.rs | 200 +++++++++++++++ libs/remote_storage/tests/test_real_azure.rs | 219 ++-------------- libs/remote_storage/tests/test_real_s3.rs | 253 +++++-------------- 3 files changed, 288 insertions(+), 384 deletions(-) create mode 100644 libs/remote_storage/tests/common/mod.rs diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs new file mode 100644 index 0000000000..bca117ed1a --- /dev/null +++ b/libs/remote_storage/tests/common/mod.rs @@ -0,0 +1,200 @@ +use std::collections::HashSet; +use std::ops::ControlFlow; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use bytes::Bytes; +use camino::Utf8Path; +use futures::stream::Stream; +use once_cell::sync::OnceCell; +use remote_storage::{Download, GenericRemoteStorage, RemotePath}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn upload_stream( + content: std::borrow::Cow<'static, [u8]>, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + use std::borrow::Cow; + + let content = match content { + Cow::Borrowed(x) => Bytes::from_static(x), + Cow::Owned(vec) => Bytes::from(vec), + }; + wrap_stream(content) +} + +pub(crate) fn wrap_stream( + content: bytes::Bytes, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + let len = content.len(); + let content = futures::future::ready(Ok(content)); + + (futures::stream::once(content), len) +} + +pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result> { + let mut buf = Vec::new(); + tokio::io::copy_buf( + &mut tokio_util::io::StreamReader::new(dl.download_stream), + &mut buf, + ) + .await?; + Ok(buf) +} + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
+pub(crate) async fn upload_simple_remote_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new( + Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), + ) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} + +pub(crate) async fn cleanup( + client: &Arc, + objects_to_delete: HashSet, +) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} +pub(crate) struct Uploads { + pub(crate) prefixes: HashSet, + pub(crate) blobs: HashSet, +} + +pub(crate) async fn upload_remote_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); + let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, data_len) = + upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, data_len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + 
.context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +pub(crate) fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + }); +} diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 7327803198..0387dc30e7 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ - AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. /// -/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`] +/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -97,7 +97,7 @@ async fn azure_pagination_should_work( /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. /// See `Azure_pagination_should_work` for more information. /// -/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`] +/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. 
`list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -218,18 +218,9 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res ctx.client.upload(data, len, &path, None).await?; - async fn download_and_compare(dl: Download) -> anyhow::Result> { - let mut buf = Vec::new(); - tokio::io::copy_buf( - &mut tokio_util::io::StreamReader::new(dl.download_stream), - &mut buf, - ) - .await?; - Ok(buf) - } // Normal download request let dl = ctx.client.download(&path).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) @@ -237,12 +228,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 0, Some(len as u64)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) @@ -250,17 +241,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 8, Some(len as u64 * 100)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); @@ -272,17 +263,6 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); -} - struct EnabledAzure { client: Arc, base_prefix: &'static str, @@ -352,7 +332,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -414,7 +394,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_simple_azure_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -478,166 +458,3 @@ fn create_azure_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_azure_data( - 
client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
-async fn upload_simple_azure_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled -// to binary -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index ecd834e61c..8f46b2abd6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
/// -/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. /// See `s3_pagination_should_work` for more information. /// -/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`] +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -198,15 +198,65 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let MaybeEnabledS3::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None).await?; + + // Normal download request + let dl = ctx.client.download(&path).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(len as u64)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // partial range (end specified) + let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(len as u64 * 100)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); + + // Partial range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); + + // Full range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete(&path) + .await + .with_context(|| format!("{path:?} removal"))?; + + Ok(()) } struct 
EnabledS3 { @@ -278,7 +328,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -340,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_simple_s3_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -403,166 +453,3 @@ fn create_s3_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_s3_data( - client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. 
See test description for more details. -async fn upload_simple_s3_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} From 6ffbbb2e02916246ee17fc40a0d4accb90295bbd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 11:32:51 +0100 Subject: [PATCH 13/57] include timeline ids in tenant details response (#6166) Part of getpage@lsn benchmark epic: https://github.com/neondatabase/neon/issues/5771 This allows getting the list of tenants and timelines without triggering initial logical size calculation by requesting the timeline details API response, which would skew our results. --- libs/pageserver_api/src/models.rs | 8 ++++++++ pageserver/client/src/mgmt_api.rs | 12 ++++++++++++ pageserver/src/http/routes.rs | 14 +++++++++----- pageserver/src/tenant.rs | 4 ++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0f5e202249..be41b610b8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -370,6 +370,14 @@ pub struct TenantInfo { pub attachment_status: TenantAttachmentStatus, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantDetails { + #[serde(flatten)] + pub tenant_info: TenantInfo, + + pub timelines: Vec, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. 
#[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 77eb1bb8e2..0ad4e1551e 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -64,6 +64,18 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn tenant_details( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn list_timelines( &self, tenant_id: TenantId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bc8b677f77..e641e44b08 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::TenantDetails; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -857,11 +858,14 @@ async fn tenant_status( } let state = tenant.current_state(); - Result::<_, ApiError>::Ok(TenantInfo { - id: tenant_shard_id, - state: state.clone(), - current_physical_size: Some(current_physical_size), - attachment_status: state.attachment_status(), + Result::<_, ApiError>::Ok(TenantDetails { + tenant_info: TenantInfo { + id: tenant_shard_id, + state: state.clone(), + current_physical_size: Some(current_physical_size), + attachment_status: state.attachment_status(), + }, + timelines: tenant.list_timeline_ids(), }) } .instrument(info_span!("tenant_status_handler", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eceef6bf78..1d6f1001db 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1552,6 +1552,10 @@ impl Tenant { .collect() } + pub fn list_timeline_ids(&self) -> Vec { + self.timelines.lock().unwrap().keys().cloned().collect() + } + /// This is used to create the initial 'main' timeline during bootstrapping, /// or when importing a new base backup. The caller is expected to load an /// initial image of the datadir to the new timeline after this. From d89af4cf8e7a17077c197a259f2f95a5afdeb2c2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 19 Dec 2023 10:38:00 +0000 Subject: [PATCH 14/57] pageserver: downgrade 'connection reset' WAL errors (#6181) This squashes a particularly noisy warn-level log that occurs when safekeepers are restarted. 
Unfortunately the error type from `tonic` doesn't provide a neat way of matching this, so we use a string comparison --- .../src/tenant/timeline/walreceiver/connection_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7bfa246eeb..5a5b3d7586 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" info!("broker disconnected: {status}"); From b701394d7ab8aeeadd8221d9280ce5742a9509f4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 19 Dec 2023 02:27:23 +0300 Subject: [PATCH 15/57] Fix WAL waiting in walproposer for v16. Just preparing cv right before waiting is not enough as we might have already missed the flushptr change & wakeup, so re-checked before sleep. https://neondb.slack.com/archives/C03QLRH7PPD/p1702830965396619?thread_ts=1702756761.836649&cid=C03QLRH7PPD --- pgxn/neon/walproposer_pg.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 551d56d416..79498b64af 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1482,6 +1482,21 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 #if PG_MAJORVERSION_NUM >= 16 if (WalSndCtl != NULL) ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); + + /* + * Now that we prepared the condvar, check flush ptr again -- it might have + * changed before we subscribed to cv so we missed the wakeup. + * + * Do that only when we're interested in new WAL: without sync-safekeepers + * and if election already passed. 
+ */ + if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn) + { + ConditionVariableCancelSleep(); + ResetLatch(MyLatch); + *events = WL_LATCH_SET; + return 1; + } #endif /* From 73d247c464ad6b72ee5a6d787d12dd93cddbc392 Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Tue, 19 Dec 2023 12:44:25 +0100 Subject: [PATCH 16/57] Analyze clickbench performance with explain plans and pg_stat_statements (#6161) ## Problem To understand differences in performance between neon, aurora and rds we want to collect explain analyze plans and pg_stat_statements for selected benchmarking runs ## Summary of changes Add workflow input options to collect explain and pg_stat_statements for benchmarking workflow Co-authored-by: BodoBolero --- .github/workflows/benchmarking.yml | 29 +++++++++++-- test_runner/performance/test_perf_olap.py | 51 ++++++++++++++++++++++- 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e2f15d96db..8bf12c31b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -23,6 +23,21 @@ on: type: boolean description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false + collect_olap_explain: + type: boolean + description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected' + required: false + default: false + collect_pg_stat_statements: + type: boolean + description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected' + required: false + default: false + run_AWS_RDS_AND_AURORA: + type: boolean + description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. 
Set this to true to run them on every workflow_dispatch' + required: false + default: false defaults: run: @@ -113,6 +128,8 @@ jobs: # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + env: + RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} runs-on: ubuntu-latest outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -152,7 +169,7 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, { "platform": "rds-aurora" }]') fi @@ -171,9 +188,9 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -337,6 +354,8 @@ jobs: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} @@ -399,6 +418,8 @@ jobs: env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 0f7615f7ed..1de7e95bbe 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -17,6 +17,27 @@ class LabelledQuery: query: str +# This must run before all tests in this module +# create extension pg_stat_statements if it does not exist +# and TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +# Theoretically this could be in a module or session scope fixture, +# however the code depends on other fixtures that have function scope +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Creating extension pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Creating extension pg_stat_statements") + query = LabelledQuery( + "Q_CREATE_EXTENSION", r"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + ) + run_psql(remote_compare, query, times=1, explain=False) + log.info("Reset pg_stat_statements") + query = LabelledQuery("Q_RESET", r"SELECT pg_stat_statements_reset();") + run_psql(remote_compare, query, times=1, explain=False) + + # A list of queries to run. # Please do not alter the label for the query, as it is used to identify it. 
# Labels for ClickBench queries match the labels in ClickBench reports @@ -78,6 +99,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # fmt: on ) +EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" + def get_scale() -> List[str]: # We parametrize each tpc-h and clickbench test with scale @@ -88,7 +111,10 @@ def get_scale() -> List[str]: return [scale] -def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None: +# run the query times times plus once with EXPLAIN VERBOSE if explain is requestd +def run_psql( + env: RemoteCompare, labelled_query: LabelledQuery, times: int, explain: bool = False +) -> None: # prepare connstr: # - cut out password from connstr to pass it via env # - add options to connstr @@ -108,6 +134,13 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N log.info(f"Run {run}/{times}") with env.zenbenchmark.record_duration(f"{label}/{run}"): env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ) + if explain: + log.info(f"Explaining query {label}") + run += 1 + with env.zenbenchmark.record_duration(f"{label}/EXPLAIN"): + env.pg_bin.run_capture( + ["psql", connstr, "-c", f"{EXPLAIN_STRING} {query}"], env=environ + ) @pytest.mark.parametrize("scale", get_scale()) @@ -120,8 +153,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql The DB prepared manually in advance """ + explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true" - run_psql(remote_compare, query, times=3) + run_psql(remote_compare, query, times=3, explain=explain) def tpch_queuies() -> Tuple[ParameterSet, ...]: @@ -195,3 +229,16 @@ def test_user_examples(remote_compare: RemoteCompare): """, ) run_psql(remote_compare, query, times=3) + + +# This must run after all tests in this module +# Collect pg_stat_statements after running the tests if TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Collecting pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Collecting pg_stat_statements") + query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") + run_psql(remote_compare, query, times=1, explain=False) From c52384752e3fa1ef89bd4b7aa9a70b173f62dd78 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 19 Dec 2023 14:10:07 +0100 Subject: [PATCH 17/57] Compile `pg_semver` extension (#6184) Closes #6183 --- Dockerfile.compute-node | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 03280586f8..a23e930c48 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -569,6 +569,23 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-semver-pg-build" +# compile pg_semver extension +# +######################################################################################### +FROM build-deps AS pg-semver-pg-build +COPY --from=pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ + echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ + mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control + ######################################################################################### # # Layer "pg-embedding-pg-build" @@ -768,6 +785,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY pgxn/ pgxn/ From a2fab3437161fd80c205fcab61ae88a82bd9cee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 19 Dec 2023 14:16:53 +0100 Subject: [PATCH 18/57] Update zstd to 0.13 (#6187) This updates the `zstd` crate to 0.13, and `zstd-sys` with it (it contains C so we should always run the newest version of that). --- Cargo.lock | 18 ++++++++---------- compute_tools/Cargo.toml | 2 +- workspace_hack/Cargo.toml | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9a367effbb..7821b3658a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -190,9 +190,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.0" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" +checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5" dependencies = [ "flate2", "futures-core", @@ -6412,30 +6412,28 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" [[package]] name = "zstd" -version = "0.12.4" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.6" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 47378f1910..18b30810b0 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -37,5 +37,5 @@ workspace_hack.workspace = true toml_edit.workspace = true remote_storage 
= { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } -zstd = "0.12.4" +zstd = "0.13" bytes = "1.0" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4621a75c0b..ffff0fda61 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -75,8 +75,8 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4"] } -zstd = { version = "0.12" } -zstd-safe = { version = "6", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } +zstd = { version = "0.13" } +zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] From e6bf6952b86b4ee757cd64ab187a78cc6715a5cb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 14:46:17 +0100 Subject: [PATCH 19/57] higher resolution histograms for getpage@lsn (#6177) part of https://github.com/neondatabase/cloud/issues/7811 --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 45c01b71d1..3f0bc3e0a7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1019,12 +1019,62 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { + [ + 1, + 10, + 20, + 40, + 60, + 80, + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1_000, // 1ms + 2_000, + 4_000, + 6_000, + 8_000, + 10_000, // 10ms + 20_000, + 40_000, + 60_000, + 80_000, + 100_000, + 200_000, + 400_000, + 600_000, + 800_000, + 1_000_000, // 1s + 2_000_000, + 4_000_000, + 6_000_000, + 8_000_000, + 10_000_000, // 10s + 20_000_000, + 50_000_000, + 100_000_000, + 200_000_000, + 1_000_000_000, // 1000s + ] + .into_iter() + .map(Duration::from_micros) + .map(|d| d.as_secs_f64()) + .collect() +}); + static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", "Time spent on smgr query handling, aggregated by query type.", &["smgr_query_type"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) .expect("failed to define a metric") }); From 8b91bbc38ef385d2d7dbbb0ce1ee66b8deec6302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 19 Dec 2023 16:45:17 +0100 Subject: [PATCH 20/57] Update jsonwebtoken to 9 and sct to 0.7.1 (#6189) This increases the list of crates that base on `ring` 0.17. 
--- Cargo.lock | 28 +++++++++++++++------------- Cargo.toml | 2 +- workspace_hack/Cargo.toml | 2 +- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7821b3658a..0e51e88e3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2487,13 +2487,14 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", - "pem 1.1.1", - "ring 0.16.20", + "js-sys", + "pem 3.0.3", + "ring 0.17.6", "serde", "serde_json", "simple_asn1", @@ -3291,18 +3292,19 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.1.1" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" dependencies = [ - "base64 0.13.1", + "base64 0.21.1", + "serde", ] [[package]] name = "pem" -version = "2.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ "base64 0.21.1", "serde", @@ -4428,12 +4430,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.6", + "untrusted 0.9.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b44544d626..6884de7bf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,7 +91,7 @@ hyper-tungstenite = "0.11" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" -jsonwebtoken = "8" +jsonwebtoken = "9" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index ffff0fda61..82bbedc4ae 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -56,7 +56,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16", features = ["std"] } +ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } From 8ff5387da142a0129a884618a5b1a3159a46f544 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 19 Dec 2023 18:17:11 +0200 Subject: [PATCH 21/57] eliminate GCC warning for unchecked result of fread (#6167) ## Problem GCCproduce warning that bread result is not checked. It doesn't affect program logic, but better live without warnings. ## Summary of changes Check read result. 
## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- pgxn/neon/walproposer_pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 79498b64af..9361f08ad2 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1712,9 +1712,9 @@ walprop_pg_after_election(WalProposer *wp) f = fopen("restart.lsn", "rb"); if (f != NULL && !wp->config->syncSafekeepers) { - fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); fclose(f); - if (lrRestartLsn != InvalidXLogRecPtr) + if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); From 0bd79eb063282408d6cc424d508bae86ff5dae4c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 19 Dec 2023 16:27:47 +0000 Subject: [PATCH 22/57] Handle role deletion when project has no databases. (#6170) There is still default 'postgres' database, that may contain objects owned by the role or some ACLs. We need to reassign objects in this database too. ## Problem If customer deleted all databases and then tries to delete role, that has some non-standard ACLs, `apply_config` operation will stuck because of failing role deletion. --- compute_tools/src/spec.rs | 52 +++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 20299c8fde..d545858dc2 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -370,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli Ok(()) } +fn reassign_owned_objects_in_one_db( + conf: Config, + role_name: &PgIdent, + db_owner: &PgIdent, +) -> Result<()> { + let mut client = conf.connect(NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.pg_quote(), + db_owner.pg_quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, + conf.get_dbname().unwrap_or(""), + db_owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); + client.simple_query(&drop_query)?; + Ok(()) +} + // Reassign all owned objects in all databases to the owner of the database. 
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { for db in &spec.cluster.databases { if db.owner != *role_name { let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); - - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db.owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, &db.name, &db.owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; + reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; } } + // Also handle case when there are no databases in the spec. + // In this case we need to reassign objects in the default database. + let conf = Config::from_str(connstr)?; + let db_owner = PgIdent::from_str("cloud_admin")?; + reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; + Ok(()) } From 82809d2ec2491fec8ab06219de3d2d03bbb92275 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 17:44:49 +0100 Subject: [PATCH 23/57] fix metric `pageserver_initial_logical_size_start_calculation` (#6191) It wasn't being incremented. Fixup of commit 1c88824ed0e6bfbce02fa92e13ca91d5ab0e37b3 Author: Christian Schwarz Date: Fri Dec 1 12:52:59 2023 +0100 initial logical size calculation: add a bunch of metrics (#5995) --- pageserver/src/metrics.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3f0bc3e0a7..4725903783 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -522,14 +522,18 @@ pub(crate) mod initial_logical_size { impl StartCalculation { pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["first", circumstances_label]); + self.0 + .with_label_values(&["first", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), } } pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["retry", circumstances_label]); + self.0 + .with_label_values(&["retry", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), } From 613906acea9a4f98920ba518301ea8195e1687a1 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 19 Dec 2023 21:06:43 +0400 Subject: [PATCH 24/57] Support custom types in broker (#5761) Old methods are unchanged for backwards compatibility. 
Added `SafekeeperDiscoveryRequest` and `SafekeeperDiscoveryResponse` types to serve as example, and also as a prerequisite for https://github.com/neondatabase/neon/issues/5471 --- storage_broker/benches/rps.rs | 29 ++- storage_broker/proto/broker.proto | 58 +++++ storage_broker/src/bin/storage_broker.rs | 277 +++++++++++++++++++---- storage_broker/src/metrics.rs | 34 ++- 4 files changed, 350 insertions(+), 48 deletions(-) diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index a0c8e1f749..d66cbefa45 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -3,9 +3,12 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use clap::Parser; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; + +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; @@ -91,15 +94,23 @@ async fn subscribe(client: Option, counter: Arc, None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + let ttid = ProtoTenantTimelineId { tenant_id: vec![0xFF; 16], timeline_id: tli_from_u64(i), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), }; - let mut stream = client - .subscribe_safekeeper_info(request) + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo.into(), + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ttid), + }), + }; + + let mut stream: tonic::Streaming = client + .subscribe_by_filter(request) .await .unwrap() .into_inner(); diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index aa9d62a29f..7d1b63d23f 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -10,6 +10,12 @@ service BrokerService { // Publish safekeeper updates. rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {}; + + // Subscribe to all messages, limited by a filter. + rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {}; + + // Publish one message. + rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {}; } message SubscribeSafekeeperInfoRequest { @@ -48,3 +54,55 @@ message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; } + +message FilterTenantTimelineId { + // If true, only messages related to `tenant_timeline_id` will be emitted. + // Otherwise, messages for all timelines will be emitted. + bool enabled = 1; + TenantTimelineId tenant_timeline_id = 2; +} + +message TypeSubscription { + MessageType type = 1; +} + +message SubscribeByFilterRequest { + // Subscription will emit messages only of the specified types. You need to specify + // at least one type to receive any messages. + repeated TypeSubscription types = 1; + + // If set and enabled, subscription will emit messages only for the specified tenant/timeline. 
+ optional FilterTenantTimelineId tenant_timeline_id = 2; +} + +enum MessageType { + UNKNOWN = 0; + SAFEKEEPER_TIMELINE_INFO = 2; + SAFEKEEPER_DISCOVERY_REQUEST = 3; + SAFEKEEPER_DISCOVERY_RESPONSE = 4; +} + +// A message with a type. +message TypedMessage { + MessageType type = 1; + + optional SafekeeperTimelineInfo safekeeper_timeline_info = 2; + optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3; + optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4; +} + +message SafekeeperDiscoveryRequest { + TenantTimelineId tenant_timeline_id = 1; +} + +// Shorter version of SafekeeperTimelineInfo, contains only necessary fields. +message SafekeeperDiscoveryResponse { + uint64 safekeeper_id = 1; + TenantTimelineId tenant_timeline_id = 2; + // WAL available to download. + uint64 commit_lsn = 3; + // A connection string to use for WAL downloading. + string safekeeper_connstr = 4; + // Availability zone of a safekeeper. + optional string availability_zone = 5; +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9f81ac6cac..4e5f8ed724 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -35,10 +35,16 @@ use tracing::*; use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; -use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; +use storage_broker::metrics::{ + BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, +}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, +}; use storage_broker::{ parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, }; @@ -73,8 +79,103 @@ struct Args { log_format: String, } -type PubId = u64; // id of publisher for registering in maps -type SubId = u64; // id of subscriber for registering in maps +/// Id of publisher for registering in maps +type PubId = u64; + +/// Id of subscriber for registering in maps +type SubId = u64; + +/// Single enum type for all messages. +#[derive(Clone, Debug, PartialEq)] +#[allow(clippy::enum_variant_names)] +enum Message { + SafekeeperTimelineInfo(SafekeeperTimelineInfo), + SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest), + SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse), +} + +impl Message { + /// Convert proto message to internal message. 
+ pub fn from(proto_msg: TypedMessage) -> Result { + match proto_msg.r#type() { + MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( + proto_msg.safekeeper_timeline_info.ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info") + })?, + )), + MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest( + proto_msg.safekeeper_discovery_request.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_request", + ) + })?, + )), + MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse( + proto_msg.safekeeper_discovery_response.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_response", + ) + })?, + )), + MessageType::Unknown => Err(Status::new( + Code::InvalidArgument, + format!("invalid message type: {:?}", proto_msg.r#type), + )), + } + } + + /// Get the tenant_timeline_id from the message. + pub fn tenant_timeline_id(&self) -> Result, Status> { + match self { + Message::SafekeeperTimelineInfo(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryRequest(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryResponse(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + } + } + + /// Convert internal message to the protobuf struct. + pub fn as_typed_message(&self) -> TypedMessage { + let mut res = TypedMessage { + r#type: self.message_type() as i32, + ..Default::default() + }; + match self { + Message::SafekeeperTimelineInfo(msg) => { + res.safekeeper_timeline_info = Some(msg.clone()) + } + Message::SafekeeperDiscoveryRequest(msg) => { + res.safekeeper_discovery_request = Some(msg.clone()) + } + Message::SafekeeperDiscoveryResponse(msg) => { + res.safekeeper_discovery_response = Some(msg.clone()) + } + } + res + } + + /// Get the message type. + pub fn message_type(&self) -> MessageType { + match self { + Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo, + Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest, + Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse, + } + } +} #[derive(Copy, Clone, Debug)] enum SubscriptionKey { @@ -83,7 +184,7 @@ enum SubscriptionKey { } impl SubscriptionKey { - // Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). + /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -92,14 +193,29 @@ impl SubscriptionKey { } } } + + /// Parse from FilterTenantTimelineId + pub fn from_proto_filter_tenant_timeline_id( + f: &FilterTenantTimelineId, + ) -> Result { + if !f.enabled { + return Ok(SubscriptionKey::All); + } + + let ttid = + parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing tenant_timeline_id") + })?)?; + Ok(SubscriptionKey::Timeline(ttid)) + } } -// Channel to timeline subscribers. +/// Channel to timeline subscribers. struct ChanToTimelineSub { - chan: broadcast::Sender, - // Tracked separately to know when delete the shmem entry. receiver_count() - // is unhandy for that as unregistering and dropping the receiver side - // happens at different moments. 
+ chan: broadcast::Sender, + /// Tracked separately to know when delete the shmem entry. receiver_count() + /// is unhandy for that as unregistering and dropping the receiver side + /// happens at different moments. num_subscribers: u64, } @@ -110,7 +226,7 @@ struct SharedState { num_subs_to_timelines: i64, chans_to_timeline_subs: HashMap, num_subs_to_all: i64, - chan_to_all_subs: broadcast::Sender, + chan_to_all_subs: broadcast::Sender, } impl SharedState { @@ -146,7 +262,7 @@ impl SharedState { &mut self, sub_key: SubscriptionKey, timeline_chan_size: usize, - ) -> (SubId, broadcast::Receiver) { + ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; let sub_rx = match sub_key { @@ -262,6 +378,29 @@ impl Registry { subscriber.id, subscriber.key, subscriber.remote_addr ); } + + /// Send msg to relevant subscribers. + pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { + PROCESSED_MESSAGES_TOTAL.inc(); + + // send message to subscribers for everything + let shared_state = self.shared_state.read(); + // Err means there is no subscribers, it is fine. + shared_state.chan_to_all_subs.send(msg.clone()).ok(); + + // send message to per timeline subscribers, if there is ttid + let ttid = msg.tenant_timeline_id()?; + if let Some(ttid) = ttid { + if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { + // Err can't happen here, as tx is destroyed only after removing + // from the map the last subscriber along with tx. + subs.chan + .send(msg.clone()) + .expect("rx is still in the map with zero subscribers"); + } + } + Ok(()) + } } // Private subscriber state. @@ -269,7 +408,7 @@ struct Subscriber { id: SubId, key: SubscriptionKey, // Subscriber receives messages from publishers here. - sub_rx: broadcast::Receiver, + sub_rx: broadcast::Receiver, // to unregister itself from shared state in Drop registry: Registry, // for logging @@ -291,26 +430,9 @@ struct Publisher { } impl Publisher { - // Send msg to relevant subscribers. - pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> { - // send message to subscribers for everything - let shared_state = self.registry.shared_state.read(); - // Err means there is no subscribers, it is fine. - shared_state.chan_to_all_subs.send(msg.clone()).ok(); - - // send message to per timeline subscribers - let ttid = - parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| { - Status::new(Code::InvalidArgument, "missing tenant_timeline_id") - })?)?; - if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { - // Err can't happen here, as tx is destroyed only after removing - // from the map the last subscriber along with tx. - subs.chan - .send(msg.clone()) - .expect("rx is still in the map with zero subscribers"); - } - Ok(()) + /// Send msg to relevant subscribers. 
+ pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { + self.registry.send_msg(msg) } } @@ -339,7 +461,7 @@ impl BrokerService for Broker { loop { match stream.next().await { - Some(Ok(msg)) => publisher.send_msg(&msg)?, + Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?, Some(Err(e)) => return Err(e), // grpc error from the stream None => break, // closed stream } @@ -371,8 +493,15 @@ impl BrokerService for Broker { let mut missed_msgs: u64 = 0; loop { match subscriber.sub_rx.recv().await { - Ok(info) => yield info, + Ok(info) => { + match info { + Message::SafekeeperTimelineInfo(info) => yield info, + _ => {}, + } + BROADCASTED_MESSAGES_TOTAL.inc(); + }, Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); missed_msgs += skipped_msg; if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", @@ -392,6 +521,78 @@ impl BrokerService for Broker { Box::pin(output) as Self::SubscribeSafekeeperInfoStream )) } + + type SubscribeByFilterStream = + Pin> + Send + 'static>>; + + /// Subscribe to all messages, limited by a filter. + async fn subscribe_by_filter( + &self, + request: Request, + ) -> std::result::Result, Status> { + let remote_addr = request + .remote_addr() + .expect("TCPConnectInfo inserted by handler"); + let proto_filter = request.into_inner(); + let ttid_filter = proto_filter + .tenant_timeline_id + .as_ref() + .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + + let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; + let types_set = proto_filter + .types + .iter() + .map(|t| t.r#type) + .collect::>(); + + let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr); + + // transform rx into stream with item = Result, as method result demands + let output = async_stream::try_stream! { + let mut warn_interval = time::interval(Duration::from_millis(1000)); + let mut missed_msgs: u64 = 0; + loop { + match subscriber.sub_rx.recv().await { + Ok(msg) => { + let msg_type = msg.message_type() as i32; + if types_set.contains(&msg_type) { + yield msg.as_typed_message(); + BROADCASTED_MESSAGES_TOTAL.inc(); + } + }, + Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); + missed_msgs += skipped_msg; + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + missed_msgs = 0; + } + } + Err(RecvError::Closed) => { + // can't happen, we never drop the channel while there is a subscriber + Err(Status::new(Code::Internal, "channel unexpectantly closed"))?; + } + } + } + }; + + Ok(Response::new( + Box::pin(output) as Self::SubscribeByFilterStream + )) + } + + /// Publish one message. + async fn publish_one( + &self, + request: Request, + ) -> std::result::Result, Status> { + let msg = Message::from(request.into_inner())?; + PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc(); + self.registry.send_msg(&msg)?; + Ok(Response::new(())) + } } // We serve only metrics and healthcheck through http1. 
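// Sketch of how a client consumes the new filtered subscription, mirroring the
// updated storage_broker/benches/rps.rs in this patch (error handling elided):
//
//     let request = SubscribeByFilterRequest {
//         types: vec![TypeSubscription {
//             r#type: MessageType::SafekeeperTimelineInfo.into(),
//         }],
//         tenant_timeline_id: Some(FilterTenantTimelineId {
//             enabled: true,
//             tenant_timeline_id: Some(ttid),
//         }),
//     };
//     let mut stream = client.subscribe_by_filter(request).await?.into_inner();
//     while let Some(msg) = stream.message().await? {
//         // `msg` is a TypedMessage; check msg.r#type() to pick the payload field.
//     }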
@@ -515,8 +716,8 @@ mod tests { use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; - fn msg(timeline_id: Vec) -> SafekeeperTimelineInfo { - SafekeeperTimelineInfo { + fn msg(timeline_id: Vec) -> Message { + Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, tenant_timeline_id: Some(ProtoTenantTimelineId { tenant_id: vec![0x00; 16], @@ -533,7 +734,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, - } + }) } fn tli_from_u64(i: u64) -> Vec { diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index f0649d0f68..1fd3dd5ad6 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. -use metrics::{register_int_gauge, IntGauge}; +use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { @@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy = Lazy::new(|| { ) .expect("Failed to register metric") }); + +pub static PROCESSED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_processed_messages_total", + "Number of messages received by storage broker, before routing and broadcasting" + ) + .expect("Failed to register metric") +}); + +pub static BROADCASTED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcasted_messages_total", + "Number of messages broadcasted (sent over network) to subscribers" + ) + .expect("Failed to register metric") +}); + +pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcast_dropped_messages_total", + "Number of messages dropped due to channel capacity overflow" + ) + .expect("Failed to register metric") +}); + +pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_published_oneoff_messages_total", + "Number of one-off messages sent via PublishOne method" + ) + .expect("Failed to register metric") +}); From 58dbca6ce3bda243db96d4e24b146c059d231467 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 19 Dec 2023 15:48:41 -0800 Subject: [PATCH 25/57] Bump vm-builder v0.19.0 -> v0.21.0 (#6197) Only applicable change was neondatabase/autoscaling#650, reducing the vector scrape interval (inside the VM) from 15 seconds to 1 second. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 693ed1a66f..6cb6d9df02 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -857,7 +857,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.19.0 + VM_BUILDER_VERSION: v0.21.0 steps: - name: Checkout From c29df806341d790f9c08e7b9c1f5fd615fb43118 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 20 Dec 2023 10:26:06 +0200 Subject: [PATCH 26/57] fix(layer): move backoff to spawned task (#5746) Move the backoff to spawned task as it can still be useful; make the sleep cancellable. 
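For illustration, a rough sketch (not the code from this patch) of what the cancellation-aware backoff amounts to; `cancel` stands in for the task-manager shutdown token and the timeline cancellation token that the real code selects on, and the growth/cap constants roughly mirror the ones used there:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Sleep for an exponential backoff, but wake up early if `cancel` fires.
async fn backoff_or_cancel(consecutive_failures: u32, cancel: &CancellationToken) {
    let secs = 1.5_f64.powi(consecutive_failures as i32).min(60.0);
    tokio::select! {
        _ = tokio::time::sleep(Duration::from_secs_f64(secs)) => {}
        _ = cancel.cancelled() => {}
    }
}
```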
--- pageserver/src/tenant/storage_layer/layer.rs | 33 +++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 9a8ddc1a6b..8ae911b31e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -878,6 +878,23 @@ impl LayerInner { Ok(()) } Err(e) => { + let consecutive_failures = + this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, + _ = timeline.cancel.cancelled() => {}, + }; + Err(e) } }; @@ -926,21 +943,9 @@ impl LayerInner { Ok(permit) } Ok((Err(e), _permit)) => { - // FIXME: this should be with the spawned task and be cancellation sensitive - // - // while we should not need this, this backoff has turned out to be useful with - // a bug of unexpectedly deleted remote layer file (#5787). - let consecutive_failures = - self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + // sleep already happened in the spawned task, if it was not cancelled + let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::time::sleep(backoff).await; Err(DownloadError::DownloadFailed) } Err(_gone) => Err(DownloadError::DownloadCancelled), From f260f1565e220eb90a3ffe2fd15597735d156d5c Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 20 Dec 2023 12:26:20 +0000 Subject: [PATCH 27/57] pageserver: fixes + test updates for sharding (#6186) This is a precursor to: - https://github.com/neondatabase/neon/pull/6185 While that PR contains big changes to neon_local and attachment_service, this PR contains a few unrelated standalone changes generated while working on that branch: - Fix restarting a pageserver when it contains multiple shards for the same tenant - When using location_config api to attach a tenant, create its timelines dir - Update test paths where generations were previously optional to make them always-on: this avoids tests having to spuriously assert that attachment_service is not None in order to make the linter happy. - Add a TenantShardId python implementation for subsequent use in test helpers that will be made shard-aware - Teach scrubber to read across shards when checking for layer existence: this is a refactor to track the list of existent layers at tenant-level rather than locally to each timeline. This is a precursor to testing shard splitting. 
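For reference, the id format that the new Python `TenantShardId` helper understands is the bare 32-character tenant id for unsharded tenants, and the tenant id plus a `-{shard_number:02x}{shard_count:02x}` suffix (37 characters total) otherwise. A minimal sketch, assuming the same layout as the helper added below:

```rust
/// Sketch only: render the string form of a tenant-shard id.
fn tenant_shard_id(tenant_id: &str, shard_number: u8, shard_count: u8) -> String {
    if shard_count == 0 {
        tenant_id.to_string() // unsharded tenants use the plain tenant id
    } else {
        format!("{tenant_id}-{shard_number:02x}{shard_count:02x}")
    }
}
```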
--- libs/pageserver_api/src/shard.rs | 2 +- pageserver/src/tenant/mgr.rs | 49 +++--- s3_scrubber/src/checks.rs | 144 +++++++++++------- s3_scrubber/src/scan_metadata.rs | 126 +++++++++++++-- test_runner/fixtures/neon_fixtures.py | 47 ++---- test_runner/fixtures/types.py | 48 ++++++ test_runner/performance/test_bulk_insert.py | 1 - .../regress/test_attach_tenant_config.py | 5 +- test_runner/regress/test_change_pageserver.py | 1 - .../regress/test_layers_from_future.py | 1 - test_runner/regress/test_pageserver_api.py | 4 +- .../regress/test_pageserver_generations.py | 10 -- .../regress/test_pageserver_restart.py | 4 +- .../regress/test_pageserver_secondary.py | 4 - test_runner/regress/test_remote_storage.py | 2 - 15 files changed, 293 insertions(+), 155 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 052fbd1402..3668f7939d 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -159,7 +159,7 @@ impl From<[u8; 18]> for TenantShardId { /// shard we're dealing with, but do not need to know the full ShardIdentity (because /// we won't be doing any page->shard mapping), and do not need to know the fully qualified /// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, pub shard_count: ShardCount, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b2f14db9f7..31d80026f0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -514,10 +514,7 @@ pub async fn init_tenant_mgr( &ctx, ) { Ok(tenant) => { - tenants.insert( - TenantShardId::unsharded(tenant.tenant_id()), - TenantSlot::Attached(tenant), - ); + tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); @@ -962,35 +959,27 @@ impl TenantManager { } let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); + + // Directory structure is the same for attached and secondary modes: + // create it if it doesn't exist. Timeline load/creation expects the + // timelines/ subdir to already exist. + // + // Does not need to be fsync'd because local storage is just a cache. + tokio::fs::create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + // Before activating either secondary or attached mode, persist the + // configuration, so that on restart we will re-attach (or re-start + // secondary) on the tenant. + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. 
- tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } + LocationMode::Secondary(_) => TenantSlot::Secondary, LocationMode::Attached(_attach_config) => { - let timelines_path = self.conf.timelines_path(&tenant_shard_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - let shard_identity = new_location_config.shard; let tenant = tenant_spawn( self.conf, diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 2acbb2352b..7b9f96dce3 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,9 +1,12 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; +use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; @@ -40,7 +43,7 @@ impl TimelineAnalysis { pub(crate) fn branch_cleanup_and_check_errors( id: &TenantShardTimelineId, - s3_root: &RootTarget, + tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, s3_data: Option, @@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation, - mut s3_layers, + index_part_generation: _index_part_generation, + s3_layers: _s3_layers, } => { if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { result.errors.push(format!( @@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors( )) } - let layer_map_key = (layer, metadata.generation); - if !s3_layers.remove(&layer_map_key) { + if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { // FIXME: this will emit false positives if an index was // uploaded concurrently with our scan. To make this check // correct, we need to try sending a HEAD request for the // layer we think is missing. result.errors.push(format!( - "index_part.json contains a layer {}{} that is not present in remote storage", - layer_map_key.0.file_name(), - layer_map_key.1.get_suffix() + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", + layer.file_name(), + metadata.generation.get_suffix(), + metadata.shard )) } } - - let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers - .into_iter() - .filter(|(_layer_name, gen)| - // A layer is only considered orphaned if it has a generation below - // the index. If the generation is >= the index, then the layer may - // be an upload from a running pageserver, or even an upload from - // a new generation that didn't upload an index yet. 
- // - // Even so, a layer that is not referenced by the index could just - // be something enqueued for deletion, so while this check is valid - // for indicating that a layer is garbage, it is not an indicator - // of a problem. - gen < &index_part_generation) - .collect(); - - if !orphan_layers.is_empty() { - // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report - // these as a hint that there is something worth cleaning up here. - result.warnings.push(format!( - "index_part.json does not contain layers from S3: {:?}", - orphan_layers - .iter() - .map(|(layer_name, gen)| format!( - "{}{}", - layer_name.file_name(), - gen.get_suffix() - )) - .collect::>(), - )); - result.garbage_keys.extend(orphan_layers.iter().map( - |(layer_name, layer_gen)| { - let mut key = s3_root.timeline_root(id).prefix_in_bucket; - let delimiter = s3_root.delimiter(); - if !key.ends_with(delimiter) { - key.push_str(delimiter); - } - key.push_str(&format!( - "{}{}", - &layer_name.file_name(), - layer_gen.get_suffix() - )); - key - }, - )); - } } BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( @@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors( result } +#[derive(Default)] +pub(crate) struct LayerRef { + ref_count: usize, +} + +/// Top-level index of objects in a tenant. This may be used by any shard-timeline within +/// the tenant to query whether an object exists. +#[derive(Default)] +pub(crate) struct TenantObjectListing { + shard_timelines: + HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, +} + +impl TenantObjectListing { + /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall + /// list of layer keys for the Tenant. + pub(crate) fn push( + &mut self, + ttid: TenantShardTimelineId, + layers: HashSet<(LayerFileName, Generation)>, + ) { + let shard_index = ShardIndex::new( + ttid.tenant_shard_id.shard_number, + ttid.tenant_shard_id.shard_count, + ); + let replaced = self.shard_timelines.insert( + (shard_index, ttid.timeline_id), + layers + .into_iter() + .map(|l| (l, LayerRef::default())) + .collect(), + ); + + assert!( + replaced.is_none(), + "Built from an S3 object listing, which should never repeat a key" + ); + } + + /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, + /// the layer's refcount will be incremented. Later, after calling this for all references in all indices + /// in a tenant, orphan layers may be detected by their zero refcounts. 
+ /// + /// Returns true if the layer exists + pub(crate) fn check_ref( + &mut self, + timeline_id: TimelineId, + layer_file: &LayerFileName, + metadata: &IndexLayerMetadata, + ) -> bool { + let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { + return false; + }; + + let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else { + return false; + }; + + layer_ref.ref_count += 1; + + true + } + + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + let mut result = Vec::new(); + for ((shard_index, timeline_id), layers) in &self.shard_timelines { + for ((layer_file, generation), layer_ref) in layers { + if layer_ref.ref_count == 0 { + result.push((*shard_index, *timeline_id, layer_file.clone(), *generation)) + } + } + } + + result + } +} + #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 91347ca21b..bcc4d2e618 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TimelineAnalysis, + TenantObjectListing, TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; +use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; use serde::Serialize; +use utils::id::TenantId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, with_errors: HashSet, with_warnings: HashSet, - with_garbage: HashSet, + with_orphans: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -87,7 +90,7 @@ impl MetadataSummary { count: 0, with_errors: HashSet::new(), with_warnings: HashSet::new(), - with_garbage: HashSet::new(), + with_orphans: HashSet::new(), indices_by_version: HashMap::new(), layer_count: MinMaxHisto::new(), timeline_size_bytes: MinMaxHisto::new(), @@ -141,6 +144,10 @@ impl MetadataSummary { } } + fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) { + self.with_orphans.insert(*ttid); + } + /// Long-form output for printing at end of a scan pub fn summary_string(&self) -> String { let version_summary: String = itertools::join( @@ -154,7 +161,7 @@ impl MetadataSummary { "Timelines: {0} With errors: {1} With warnings: {2} -With garbage: {3} +With orphan layers: {3} Index versions: {version_summary} Timeline size bytes: {4} Layer size bytes: {5} @@ -163,7 +170,7 @@ Timeline layer count: {6} self.count, self.with_errors.len(), self.with_warnings.len(), - self.with_garbage.len(), + self.with_orphans.len(), self.timeline_size_bytes.oneline(), self.layer_size_bytes.oneline(), self.layer_count.oneline(), @@ -191,7 +198,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::ResultS3TimelineBlobData for each tenant, because different + // shards in the same tenant might refer to one anothers' keys if a shard split has happened. 
+ + let mut tenant_id = None; + let mut tenant_objects = TenantObjectListing::default(); + let mut tenant_timeline_results = Vec::new(); + + fn analyze_tenant( + tenant_id: TenantId, + summary: &mut MetadataSummary, + mut tenant_objects: TenantObjectListing, + timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + ) { + let mut timeline_generations = HashMap::new(); + for (ttid, data) in timelines { + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = + branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); + summary.update_analysis(&ttid, &analysis); + } + + // Identifying orphan layers must be done on a tenant-wide basis, because individual + // shards' layers may be referenced by other shards. + // + // Orphan layers are not a corruption, and not an indication of a problem. They are just + // consuming some space in remote storage, and may be cleaned up at leisure. + for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() { + let ttid = TenantShardTimelineId { + tenant_shard_id: TenantShardId { + tenant_id, + shard_count: shard_index.shard_count, + shard_number: shard_index.shard_number, + }, + timeline_id, + }; + + if let Some(timeline_generation) = timeline_generations.get(&ttid) { + if &generation >= timeline_generation { + // Candidate orphan layer is in the current or future generation relative + // to the index we read for this timeline shard, so its absence from the index + // doesn't make it an orphan: more likely, it is a case where the layer was + // uploaded, but the index referencing the layer wasn't written yet. + continue; + } + } + + let orphan_path = remote_layer_path( + &tenant_id, + &timeline_id, + shard_index, + &layer_file, + generation, + ); + + tracing::info!("Orphan layer detected: {orphan_path}"); + + summary.notify_timeline_orphan(&ttid); + } + } + + // Iterate through all the timeline results. These are in key-order, so + // all results for the same tenant will be adjacent. We accumulate these, + // and then call `analyze_tenant` to flush, when we see the next tenant ID. 
let mut summary = MetadataSummary::new(); pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); - let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)); + match tenant_id { + None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + Some(prev_tenant_id) => { + if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + let tenant_objects = std::mem::take(&mut tenant_objects); + let timelines = std::mem::take(&mut tenant_timeline_results); + analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + } + } + } - summary.update_analysis(&ttid, &analysis); + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } = &data.blob_data + { + tenant_objects.push(ttid, s3_layers.clone()); + } + tenant_timeline_results.push((ttid, data)); + } + + if !tenant_timeline_results.is_empty() { + analyze_tenant( + tenant_id.expect("Must be set if results are present"), + &mut summary, + tenant_objects, + tenant_timeline_results, + ); } Ok(summary) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 42e122cefe..a9133f1c9c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -457,7 +457,6 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.enable_generations = True self.scrub_on_exit = False self.test_output_dir = test_output_dir @@ -677,8 +676,7 @@ class NeonEnvBuilder: pageserver.stop(immediate=True) - if self.env.attachment_service is not None: - self.env.attachment_service.stop(immediate=True) + self.env.attachment_service.stop(immediate=True) cleanup_error = None @@ -772,13 +770,9 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - if config.enable_generations: - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) - else: - self.control_plane_api = None - self.attachment_service = None + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: NeonAttachmentService = NeonAttachmentService(self) # Create a config file corresponding to the options cfg: Dict[str, Any] = { @@ -851,8 +845,7 @@ class NeonEnv: # Start up broker, pageserver and all safekeepers self.broker.try_start() - if self.attachment_service is not None: - self.attachment_service.start() + self.attachment_service.start() for pageserver in self.pageservers: pageserver.start() @@ -1834,20 +1827,19 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() return client.tenant_attach( - tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id) + tenant_id, + config, + config_null, + generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), ) def tenant_detach(self, tenant_id: TenantId): - if self.env.attachment_service is not None: - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.attachment_service.attach_hook_drop(tenant_id) client = self.http_client() 
return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): - # This API is only for use when generations are enabled - assert self.env.attachment_service is not None - if config["mode"].startswith("Attached") and "generation" not in config: config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) @@ -1873,26 +1865,15 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.maybe_get_generation(tenant_id) + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() - return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id)) - - def maybe_get_generation(self, tenant_id: TenantId): - """ - For tests that would like to use an HTTP client directly instead of using - the `tenant_attach` and `tenant_create` helpers here: issue a generation - number for a tenant. - - Returns None if the attachment service is not enabled (legacy mode) - """ - if self.env.attachment_service is not None: - return self.env.attachment_service.attach_hook_issue(tenant_id, self.id) - else: - return None + return client.tenant_load( + tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + ) def append_pageserver_param_overrides( diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d95368f990..ea648e460d 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -125,3 +125,51 @@ class TenantId(Id): class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + + +# Workaround for compat with python 3.9, which does not have `typing.Self` +TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") + + +class TenantShardId: + def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): + self.tenant_id = tenant_id + self.shard_number = shard_number + self.shard_count = shard_count + assert self.shard_number < self.shard_count or self.shard_count == 0 + + @classmethod + def parse(cls: Type[TTenantShardId], input) -> TTenantShardId: + if len(input) == 32: + return cls( + tenant_id=TenantId(input), + shard_number=0, + shard_count=0, + ) + elif len(input) == 37: + return cls( + tenant_id=TenantId(input[0:32]), + shard_number=int(input[33:35], 16), + shard_count=int(input[35:37], 16), + ) + else: + raise ValueError(f"Invalid TenantShardId '{input}'") + + def __str__(self): + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + + def _tuple(self) -> tuple[TenantId, int, int]: + return (self.tenant_id, self.shard_number, self.shard_count) + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() < other._tuple() + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() == other._tuple() + + def __hash__(self) -> int: + return hash(self._tuple()) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index a2a1fa11e5..edc23b29ba 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -61,7 +61,6 @@ def measure_recovery_time(env: 
NeonCompare): # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. - assert env.env.attachment_service is not None attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 352ec13884..32397bbcc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -136,10 +136,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {} - gen = env.pageserver.maybe_get_generation(tenant_id) - if gen is not None: - body["generation"] = gen + body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} ps_http.post( f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 1b6c982850..adb67a579e 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -87,7 +87,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off attachment service to treat the one we're # about to start as the attached pageserver - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ef2b2185c3..340188c1ae 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -157,7 +157,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - assert env.attachment_service is not None attachment = env.attachment_service.inspect(tenant_id) assert attachment is not None return attachment[0] diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 64e41a2dd5..573d2139ce 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -72,7 +72,9 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id)) + client.tenant_create( + tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4488be31c5..9c2f5786d4 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -187,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - After upgrade, the bucket should contain a mixture. - In both cases, postgres I/O should work. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -196,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - assert env.attachment_service is not None env.attachment_service.start() env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) @@ -262,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None some_other_pageserver = 1234 ps_http = env.pageserver.http_client() @@ -341,7 +337,6 @@ def test_deletion_queue_recovery( :param validate_before: whether to wait for deletions to be validated before restart. This makes them elegible to be executed after restart, if the same node keeps the attachment. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -405,7 +400,6 @@ def test_deletion_queue_recovery( if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) env.pageserver.start() @@ -453,7 +447,6 @@ def test_deletion_queue_recovery( def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -473,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - assert env.attachment_service is not None env.attachment_service.stop() # Remember how many validations had happened before the control plane went offline @@ -545,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): and must be constructed using the proper generation for the layer, which may not be the same generation that the tenant is running in. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -575,7 +566,6 @@ def test_multi_attach( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 3cac32b790..c4499196b5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -9,9 +9,7 @@ from fixtures.utils import wait_until # Test restarting page server, while safekeeper and compute node keep # running. 
-@pytest.mark.parametrize("generations", [True, False]) -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): - neon_env_builder.enable_generations = generations +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 64ade346aa..8ae4297983 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -57,13 +57,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): states are valid, so that we may test it in this way: the API should always work as long as the tenant exists. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None pageservers = env.pageservers list([p.http_client() for p in pageservers]) @@ -210,13 +208,11 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): """ Test the sequence of location states that are used in a live migration. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None tenant_id = env.initial_tenant timeline_id = env.initial_timeline diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 3004d69f50..2fda56d0f4 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -60,8 +60,6 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - neon_env_builder.enable_generations = generations - # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. 
From 0f56104a6120876c387fcecb10b8f76dcef77504 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 20 Dec 2023 19:06:55 +0400 Subject: [PATCH 28/57] Make sk_collect_dumps also possible with teleport (#4739) Co-authored-by: Arseny Sher --- scripts/sk_collect_dumps/.gitignore | 2 + scripts/sk_collect_dumps/ansible.cfg | 11 ++++++ scripts/sk_collect_dumps/pyproject.toml | 16 ++++++++ scripts/sk_collect_dumps/readme.md | 50 +++++++++++++++++-------- scripts/sk_collect_dumps/remote.yaml | 33 ++++++++++++---- scripts/sk_collect_dumps/ssh.cfg | 13 +++++++ scripts/sk_collect_dumps/upload.sh | 26 ++++++------- 7 files changed, 115 insertions(+), 36 deletions(-) create mode 100644 scripts/sk_collect_dumps/ansible.cfg create mode 100644 scripts/sk_collect_dumps/pyproject.toml create mode 100644 scripts/sk_collect_dumps/ssh.cfg diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore index d9d4d0296a..cdf99aefd7 100644 --- a/scripts/sk_collect_dumps/.gitignore +++ b/scripts/sk_collect_dumps/.gitignore @@ -1,2 +1,4 @@ result *.json +hosts +poetry.lock diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg new file mode 100644 index 0000000000..150986ab79 --- /dev/null +++ b/scripts/sk_collect_dumps/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +host_key_checking = False +inventory=./hosts +remote_tmp=/tmp +remote_user=developer +callbacks_enabled = profile_tasks + +[ssh_connection] +scp_if_ssh = True +ssh_args = -F ./ssh.cfg +pipelining = True diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml new file mode 100644 index 0000000000..c6f6adafe2 --- /dev/null +++ b/scripts/sk_collect_dumps/pyproject.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "sk-collect-dumps" +version = "0.1.0" +description = "" +authors = ["Arseny Sher "] +readme = "README.md" +packages = [{include = "sk_collect_dumps"}] + +[tool.poetry.dependencies] +python = "^3.11" +ansible = "^9.1.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 52b73e9495..7494a6cb78 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -1,25 +1,43 @@ # Collect /v1/debug_dump from all safekeeper nodes -1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. - -## How to use ansible (staging) - +3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +# staging: +AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# prod: +AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# check +echo $AUTH_TOKEN +``` +2. 
Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +There are two ways to do that, with ssm or tsh. ssm: +``` +# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit): +AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml +``` +It will put the results to .results directory *near the playbook*. + +tsh: + +Update the inventory, if needed, selecting .build/.tech and optionally region: +``` +rm -f hosts && echo '[safekeeper]' >> hosts +# staging: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts +# prod: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts ``` -## How to use ansible (prod) - +Test ansible connection: ``` -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +ansible all -m ping -v ``` +Download the dumps: +``` +mkdir -p result && rm -f result/* +ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml +``` + +3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. 
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml index 29ce83efde..f214d0ae2c 100644 --- a/scripts/sk_collect_dumps/remote.yaml +++ b/scripts/sk_collect_dumps/remote.yaml @@ -1,18 +1,37 @@ - name: Fetch state dumps from safekeepers - hosts: safekeepers + hosts: safekeeper gather_facts: False - remote_user: "{{ remote_user }}" tasks: - - name: Download file + - name: Dump file get_url: url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" - dest: "/tmp/{{ inventory_hostname }}.json" + dest: "/tmp/{{ inventory_hostname }}-dump.json" + headers: + Authorization: "Bearer {{ auth_token }}" - - name: Fetch file from remote hosts + - name: install rsync + ansible.builtin.apt: + name: rsync + update_cache: yes + become: yes + ignore_errors: true # it can be already installed and we don't always have sudo + + - name: Fetch file from remote hosts (works only with ssm) fetch: - src: "/tmp/{{ inventory_hostname }}.json" - dest: "./result/{{ inventory_hostname }}.json" + src: "/tmp/{{ inventory_hostname }}-dump.json" + dest: "./result/{{ inventory_hostname }}-dump.json" flat: yes fail_on_missing: no + when: ansible_connection == "aws_ssm" + # xxx not sure how to make ansible 'synchronize' work with tsh + - name: Fetch file from remote hosts + shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json" + delegate_to: localhost + when: ansible_connection != "aws_ssm" + + - name: remove remote dumps + ansible.builtin.file: + path: "/tmp/{{ inventory_hostname }}-dump.json" + state: absent diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg new file mode 100644 index 0000000000..827c5d9286 --- /dev/null +++ b/scripts/sk_collect_dumps/ssh.cfg @@ -0,0 +1,13 @@ +# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh + +# Common flags for all teleport.aws.neon.tech hosts +Host * + HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com + +# Flags for all teleport.aws.neon.tech hosts except the proxy +Host * !teleport.aws.neon.tech + Port 3022 + ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p + User developer + +# End generated Teleport configuration \ No newline at end of file diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh index 2e54ecba1c..5189883fcb 100755 --- a/scripts/sk_collect_dumps/upload.sh +++ b/scripts/sk_collect_dumps/upload.sh @@ -31,22 +31,22 @@ SELECT (data->>'tenant_id') AS tenant_id, (data->>'timeline_id') AS timeline_id, (data->'memory'->>'active')::bool AS active, - (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, - (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, - (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, - (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, - (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, - (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn, + 
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn, (data->'memory'->>'num_computes')::bigint AS num_computes, - (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn, (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, (data->'memory'->>'is_cancelled')::bool AS is_cancelled, - (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, - (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn, (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, - (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, - (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, - (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, - (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn + (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn FROM tmp_json EOF From ac38d3a88c933f11860f770a39ab984905e01b32 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 20 Dec 2023 17:00:29 +0000 Subject: [PATCH 29/57] remote_storage: don't count 404s as errors (#6201) ## Problem Currently a chart of S3 error rate is misleading: it can show errors any time we are attaching a tenant (probing for index_part generation, checking for remote delete marker). Considering 404 successful isn't perfectly elegant, but it enables the error rate to be used a a more meaningful alert signal: it would indicate if we were having auth issues, sending bad requests, getting throttled ,etc. ## Summary of changes Track 404 requests in the AttemptOutcome::Ok bucket instead of the AttemptOutcome::Err bucket. --- libs/remote_storage/src/s3_bucket.rs | 30 +++++++++++++-------- test_runner/regress/test_timeline_delete.py | 9 ------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 97fa1bbf5b..d63a5ed99b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -218,14 +218,6 @@ impl S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - if get_object.is_err() { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Err, - started_at, - ); - } - match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); @@ -241,11 +233,27 @@ impl S3Bucket { }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. 
+ metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); Err(DownloadError::NotFound) } - Err(e) => Err(DownloadError::Other( - anyhow::Error::new(e).context("download s3 object"), - )), + Err(e) => { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + Err(DownloadError::Other( + anyhow::Error::new(e).context("download s3 object"), + )) + } } } } diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index c6d578a7a2..82ffcb1177 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http, env.initial_tenant, timeline_id, iterations=iterations ) - if failpoint == "timeline-delete-after-index-delete": - m = ps_http.get_metrics() - assert ( - m.query_one( - "remote_storage_s3_request_seconds_count", - filter={"request_type": "get_object", "result": "ok"}, - ).value - == 1 # index part for initial timeline - ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state From 48f156b8a2e1ea69823c355cec4cce86f25676ff Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 20 Dec 2023 20:44:19 +0200 Subject: [PATCH 30/57] feat: relative last activity based eviction (#6136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new disk usage based eviction option, EvictionOrder, which selects whether to use the current `AbsoluteAccessed` or this new proposed but not yet tested `RelativeAccessed`. Additionally a fudge factor was noticed while implementing this, which might help sparing smaller tenants at the expense of targeting larger tenants. Cc: #5304 Co-authored-by: Arpad Müller --- pageserver/src/config.rs | 1 + pageserver/src/disk_usage_eviction_task.rs | 286 ++++++++++++++++-- pageserver/src/http/routes.rs | 15 +- .../regress/test_disk_usage_eviction.py | 116 +++++-- 4 files changed, 363 insertions(+), 55 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index bd63c4d860..8516f397ca 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1468,6 +1468,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); match &conf.default_tenant_conf.eviction_policy { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 76906cfaf7..23b9b573b6 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` +/// partitioning. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + /// Order the layers to be evicted by how recently they have been accessed in absolute + /// time. + /// + /// This strategy is unfair when some tenants grow faster than others towards the slower + /// growing. 
+ #[default] + AbsoluteAccessed, + + /// Order the layers to be evicted by how recently they have been accessed relatively within + /// the set of resident layers of a tenant. + /// + /// This strategy will evict layers more fairly but is untested. + RelativeAccessed { + #[serde(default)] + highest_layer_count_loses_first: bool, + }, +} + +impl EvictionOrder { + /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer + /// counts should be the first ones to have their layers evicted. + fn highest_layer_count_loses_first(&self) -> bool { + match self { + EvictionOrder::AbsoluteAccessed => false, + EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => *highest_layer_count_loses_first, + } + } } #[derive(Default)] @@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl( + state, + storage, + usage_pre, + task_config.eviction_order, + cancel, + ) + .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) @@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(cancel).await? { + let candidates = match collect_eviction_candidates(eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } @@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { + let nth = i + 1; let desc = candidate.layer.layer_desc(); + let total_candidates = candidates.len(); + let size = desc.file_size; + let rel = candidate.relative_last_activity; debug!( - "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", - i + 1, - candidates.len(), - desc.file_size, + "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - partition, desc.tenant_shard_id, desc.timeline_id, candidate.layer, @@ -459,6 +506,7 @@ struct EvictionCandidate { timeline: Arc, layer: Layer, last_activity_ts: SystemTime, + relative_last_activity: finite_f32::FiniteF32, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -478,24 +526,24 @@ enum EvictionCandidates { /// order. A caller that evicts in that order, until pressure is relieved, implements /// the eviction policy outlined in the module comment. /// -/// # Example +/// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. 
/// The eviction order would be /// /// ```text -/// partition last_activity_ts tenant/layer -/// Above 18:30 A/c -/// Above 19:00 A/b -/// Above 18:29 B/c -/// Above 19:05 B/b -/// Above 20:00 B/a -/// Above 20:03 A/a -/// Below 20:30 A/d -/// Below 20:40 B/d -/// Below 20:45 B/e -/// Below 20:58 A/e +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. @@ -505,7 +553,77 @@ enum EvictionCandidates { /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhauting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. +/// +/// # Example with EvictionOrder::RelativeAccessed +/// +/// ```text +/// partition relative_age last_activity_ts tenant/layer +/// Above 0/4 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/4 19:00 A/b +/// Above 1/4 19:05 B/b +/// Above 2/4 20:00 B/a +/// Above 2/4 20:03 A/a +/// Below 3/4 20:30 A/d +/// Below 3/4 20:40 B/d +/// Below 4/4 20:45 B/e +/// Below 4/4 20:58 A/e +/// ``` +/// +/// With tenants having the same number of layers the picture does not change much. The same with +/// A having many more layers **resident** (not all of them listed): +/// +/// ```text +/// Above 0/100 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/100 19:00 A/b +/// Above 2/100 20:03 A/a +/// Above 3/100 20:03 A/nth_3 +/// Above 4/100 20:03 A/nth_4 +/// ... +/// Above 1/4 19:05 B/b +/// Above 25/100 20:04 A/nth_25 +/// ... +/// Above 2/4 20:00 B/a +/// Above 50/100 20:10 A/nth_50 +/// ... +/// Below 3/4 20:40 B/d +/// Below 99/100 20:30 A/nth_99 +/// Below 4/4 20:45 B/e +/// Below 100/100 20:58 A/nth_100 +/// ``` +/// +/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is +/// difficult to see is what happens on the next round assuming the evicting 23 from the above list +/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has +/// appeared: +/// +/// ```text +/// Above 0/87 20:04 A/nth_23 +/// Above 0/3 19:05 B/b +/// Above 0/50 20:59 C/nth_0 +/// Above 1/87 20:04 A/nth_24 +/// Above 1/50 21:00 C/nth_1 +/// Above 2/87 20:04 A/nth_25 +/// ... +/// Above 16/50 21:02 C/nth_16 +/// Above 1/3 20:00 B/a +/// Above 27/87 20:10 A/nth_50 +/// ... +/// Below 2/3 20:40 B/d +/// Below 49/50 21:05 C/nth_49 +/// Below 86/87 20:30 A/nth_99 +/// Below 3/3 20:45 B/e +/// Below 50/50 21:05 C/nth_50 +/// Below 87/87 20:58 A/nth_100 +/// ``` +/// +/// Now relieving pressure with 23 layers would cost: +/// - tenant A 14 layers +/// - tenant B 1 layer +/// - tenant C 8 layers async fn collect_eviction_candidates( + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { // get a snapshot of the list of tenants @@ -591,12 +709,63 @@ async fn collect_eviction_candidates( tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - for (timeline, layer_info) in tenant_candidates.into_iter() { + + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if eviction_order.highest_layer_count_loses_first() { + // relative_age vs. 
tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = tenant_candidates + .len() + .checked_sub(fudge) + .filter(|&x| x > 0) + // support 0 or 1 resident layer tenants as well + .unwrap_or(1); + let divider = total as f32; + + for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { let file_size = layer_info.file_size(); + + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + let relative_last_activity = if matches!( + eviction_order, + EvictionOrder::RelativeAccessed { .. } + ) { + // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or + // similarly for u16. unsure how it would help. + finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } else { + finite_f32::FiniteF32::ZERO + }; + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, + relative_last_activity, }; let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -610,8 +779,19 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + match eviction_order { + EvictionOrder::AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + EvictionOrder::RelativeAccessed { .. } => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }); + } + } Ok(EvictionCandidates::Finished(candidates)) } @@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey { } } +/// A totally ordered f32 subset we can use with sorting functions. +mod finite_f32 { + + /// A totally ordered f32 subset we can use with sorting functions. 
+ #[derive(Clone, Copy, PartialEq)] + pub struct FiniteF32(f32); + + impl std::fmt::Debug for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0, f) + } + } + + impl std::fmt::Display for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } + } + + impl std::cmp::Eq for FiniteF32 {} + + impl std::cmp::PartialOrd for FiniteF32 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl std::cmp::Ord for FiniteF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.total_cmp(&other.0) + } + } + + impl TryFrom for FiniteF32 { + type Error = f32; + + fn try_from(value: f32) -> Result { + if value.is_finite() { + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } + + impl FiniteF32 { + pub const ZERO: FiniteF32 = FiniteF32(0.0); + + pub fn try_from_normalized(value: f32) -> Result { + if (0.0..=1.0).contains(&value) { + // -0.0 is within the range, make sure it is assumed 0.0..=1.0 + let value = value.abs(); + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } +} + mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; @@ -721,6 +961,7 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { + use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -732,6 +973,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e641e44b08..3ea79ea4f2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1566,19 +1566,22 @@ async fn disk_usage_eviction_run( struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, + + #[serde(default)] + eviction_order: crate::disk_usage_eviction_task::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct - config: Config, + evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { - self.config.evict_bytes > self.freed_bytes + self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { @@ -1589,7 +1592,7 @@ async fn disk_usage_eviction_run( let config = json_request::(&mut r).await?; let usage = Usage { - config, + evict_bytes: config.evict_bytes, freed_bytes: 0, }; @@ -1604,7 +1607,11 @@ async fn disk_usage_eviction_run( let state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, storage, usage, &cancel, + &state, + storage, + usage, + config.eviction_order, + &cancel, ) .await; diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index f3f3a1ddf3..9fdc4d59f5 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,6 +1,7 @@ +import enum import time from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import pytest import toml @@ -64,6 +65,23 @@ def test_min_resident_size_override_handling( assert_config(tenant_id, None, config_level_override) +@enum.unique +class EvictionOrder(str, enum.Enum): + ABSOLUTE_ORDER = "absolute" + RELATIVE_ORDER_EQUAL = "relative_equal" + RELATIVE_ORDER_SPARE = "relative_spare" + + def config(self) -> Dict[str, Any]: + if self == EvictionOrder.ABSOLUTE_ORDER: + return {"type": "AbsoluteAccessed"} + elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + elif self == EvictionOrder.RELATIVE_ORDER_SPARE: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + else: + raise RuntimeError(f"not implemented: {self}") + + @dataclass class EvictionEnv: timelines: list[Tuple[TenantId, TimelineId]] @@ -108,13 +126,14 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior + self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder ): disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, "min_avail_bytes": min_avail_bytes, "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), } enc = toml.TomlEncoder() @@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_evicts_until_pressure_is_relieved( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Basic test to ensure that we evict enough to relieve pressure. 
""" @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. """ @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. 
# So, set target to all occupied space, except 2*env.layer_size per tenant target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): ), "all tenants should have lost some layers" warm_size = later_du_by_timeline[warm] - - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] - - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size - cold_size = later_du_by_timeline[cold] - cold_upper = 2 * env.layer_size - log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" - ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + if order == EvictionOrder.ABSOLUTE_ORDER: + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + cold_upper = 2 * env.layer_size + log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + + assert ( + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + else: + # just go with the space was freed, find proper limits later + pass def poor_mans_du( @@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") @@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): @@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): From baa1323b4a1d4d38f67101822e1cf20dc38f7ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 20 Dec 2023 23:38:58 +0100 Subject: [PATCH 31/57] Use ProfileFileCredentialsProvider for AWS SDK configuration (#6202) Allows usage via `aws sso login --profile=
<profile>; AWS_PROFILE=<profile> <command>
`. Now there is no need to manually configure things any more via `SSO_ACCOUNT_ID` and others. Now one can run the tests locally (given Neon employee access to aws): ``` aws sso login --profile dev export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty REMOTE_STORAGE_S3_REGION=eu-central-1 REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev AWS_PROFILE=dev cargo test -p remote_storage -j 1 s3 -- --nocapture ``` Also makes the scrubber use the same region for auth that it does its operations in (not touching the hard coded role name and start_url values here, they are not ideal though). --- libs/remote_storage/src/s3_bucket.rs | 18 ++++++++++++++---- s3_scrubber/src/lib.rs | 8 +++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d63a5ed99b..98be6f0637 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,6 +16,7 @@ use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, + profile::ProfileFileCredentialsProvider, provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, @@ -74,20 +75,29 @@ impl S3Bucket { let region = Some(Region::new(aws_config.bucket_region.clone())); + let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket - .or_else("token", { - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + .or_else( + "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) - .build() - }) + .build(), + ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2338c21e5..8fb1346c8e 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -15,6 +15,7 @@ use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; use aws_sdk_s3::config::Region; @@ -255,6 +256,11 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let chain = CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder().build(), ); // Use SSO if we were given an account ID @@ -265,7 +271,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") - .region(Region::from_static("eu-central-1")) + .region(bucket_region.clone()) .build(), ), None => chain, From 
48890d206e7f3fca54a06f5ab08955a0e2d512f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 21 Dec 2023 12:52:38 +0100 Subject: [PATCH 32/57] Simplify inject_index_part test function (#6207) Instead of manually constructing the directory's path, we can just use the `parent()` function. This is a drive-by improvement from #6206 --- pageserver/src/tenant/remote_timeline_client.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 52ee8f49ce..1b0cf39fbe 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2192,15 +2192,6 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); - let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let remote_timeline_dir = test_state.harness.remote_fs_dir.join( - timeline_path - .strip_prefix(&test_state.harness.conf.workdir) - .unwrap(), - ); - - std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( &test_state.harness.tenant_shard_id, @@ -2209,6 +2200,10 @@ mod tests { ) .get_path(), ); + + std::fs::create_dir_all(index_path.parent().unwrap()) + .expect("creating test dir should work"); + eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); example_index_part From 2df3602a4b3fa87fafb589974aa376719171d910 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 21 Dec 2023 12:00:10 +0000 Subject: [PATCH 33/57] Add GC to http connection pool (#6196) ## Problem HTTP connection pool will grow without being pruned ## Summary of changes Remove connection clients from pools once idle, or once they exit. Periodically clear pool shards. GC Logic: Each shard contains a hashmap of `Arc`s. Each connection stores a `Weak`. During a GC sweep, we take a random shard write lock, and check that if any of the `Arc`s are unique (using `Arc::get_mut`). - If they are unique, then we check that the endpoint-pool is empty, and sweep if it is. - If they are not unique, then the endpoint-pool is in active use and we don't sweep. - Idle connections will self-clear from the endpoint-pool after 5 minutes. Technically, the uniqueness of the endpoint-pool should be enough to consider it empty, but the connection count check is done for completeness sake. 
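To make the sweep rule concrete, here is a minimal, self-contained sketch of the per-shard check, with simplified types and `parking_lot` for the lock as in this patch; the real `gc()` in `conn_pool.rs` below additionally picks a random shard, clears closed clients from each (db, user) pool, and updates metrics:

```
use std::collections::HashMap;
use std::sync::Arc;

use parking_lot::RwLock;

// Simplified stand-in for the real pool, which also tracks per-(db, user)
// sub-pools, a max_conns limit and metrics guards.
struct EndpointConnPool {
    total_conns: usize,
}

type Shard = HashMap<String, Arc<RwLock<EndpointConnPool>>>;

/// Sweep one shard: an endpoint pool is reclaimable only when nothing else
/// holds a strong reference to it (`Arc::get_mut` succeeds) *and* it holds
/// no pooled connections.
fn gc_shard(shard: &mut Shard) -> usize {
    let before = shard.len();
    shard.retain(|_endpoint, pool| {
        // If `get_mut` returns Some, this map entry is the only strong
        // reference left: no connection task can check a client in or out
        // right now, so reading `total_conns` here is race-free.
        if let Some(pool) = Arc::get_mut(pool) {
            if pool.get_mut().total_conns == 0 {
                return false; // unique and empty: drop the endpoint pool
            }
        }
        // Otherwise it is either in active use (other strong refs / upgraded
        // Weaks exist) or still holds idle connections, which clear themselves
        // after the idle timeout.
        true
    });
    before - shard.len()
}

fn main() {
    let mut shard: Shard = HashMap::new();
    shard.insert(
        "ep-example-123456".into(),
        Arc::new(RwLock::new(EndpointConnPool { total_conns: 0 })),
    );
    println!("reclaimed {} endpoint pools", gc_shard(&mut shard));
}
```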
--- proxy/src/bin/proxy.rs | 49 +++- proxy/src/config.rs | 6 +- proxy/src/serverless.rs | 9 + proxy/src/serverless/conn_pool.rs | 366 ++++++++++++++++++-------- proxy/src/serverless/sql_over_http.rs | 9 +- 5 files changed, 321 insertions(+), 118 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index be3989d387..5bc2d377a6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -11,6 +11,7 @@ use proxy::http; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; @@ -95,12 +96,8 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, - /// timeout for http connections - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -138,6 +135,36 @@ struct ProxyCliArgs { disable_ip_check_for_http: bool, } +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will causes sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. 
+ /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let _logging_guard = proxy::logging::init().await?; @@ -327,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; let http_config = HttpConfig { - timeout: args.sql_over_http_timeout, - pool_opt_in: args.sql_over_http_pool_opt_in, + request_timeout: args.sql_over_http.sql_over_http_timeout, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + }, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2ed248af8d..610bf7e424 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::{auth, rate_limiter::RateBucketInfo}; +use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -36,8 +36,8 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub timeout: tokio::time::Duration, - pub pool_opt_in: bool, + pub request_timeout: tokio::time::Duration, + pub pool_options: GlobalConnPoolOptions, } pub struct AuthenticationConfig { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index e358a0712f..07825da8dc 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,9 +6,13 @@ mod conn_pool; mod sql_over_http; mod websocket; +pub use conn_pool::GlobalConnPoolOptions; + use anyhow::bail; use hyper::StatusCode; use metrics::IntCounterPairGuard; +use rand::rngs::StdRng; +use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; @@ -47,6 +51,11 @@ pub async fn task_main( let conn_pool = conn_pool::GlobalConnPool::new(config); + let conn_pool2 = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool2.gc_worker(StdRng::from_entropy()).await; + }); + // shutdown the connection pool tokio::spawn({ let cancellation_token = cancellation_token.clone(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index df2d1bea32..c476560215 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,15 +1,19 @@ use anyhow::{anyhow, Context}; use async_trait::async_trait; use dashmap::DashMap; -use futures::future::poll_fn; +use futures::{future::poll_fn, Future}; +use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; +use once_cell::sync::Lazy; use parking_lot::RwLock; use pbkdf2::{ password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, Params, Pbkdf2, }; use pq_proto::StartupMessageParams; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use rand::Rng; use smol_str::SmolStr; -use std::{collections::HashMap, net::IpAddr, sync::Arc}; +use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration}; 
use std::{ fmt, task::{ready, Poll}, @@ -18,7 +22,7 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ @@ -30,11 +34,10 @@ use crate::{ }; use crate::{compute, config}; -use tracing::{error, warn, Span}; +use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; pub const APP_NAME: &str = "/sql_over_http"; -const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] pub struct ConnInfo { @@ -69,6 +72,77 @@ struct ConnPoolEntry { pub struct EndpointConnPool { pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, + max_conns: usize, + _guard: IntCounterPairGuard, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + let Self { + pools, total_conns, .. + } = self; + pools + .get_mut(&db_user) + .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + } + + fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + let Self { + pools, total_conns, .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.conn_id != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + let conn_id = client.conn_id; + + if client.inner.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return Ok(()); + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + // we create this db-user entry in get, so it should not be None + if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + } + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } } /// 4096 is the number of rounds that SCRAM-SHA-256 recommends. @@ -87,6 +161,27 @@ pub struct DbUserConnPool { password_hash: Option, } +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.inner.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + } + + fn get_conn_entry(&mut self, conns: &mut usize) -> Option { + self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + } + conn + } +} + pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // @@ -94,52 +189,127 @@ pub struct GlobalConnPool { // pool as early as possible and release the lock. global_pool: DashMap>>, + /// Number of endpoint-connection pools + /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. 
/// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. global_pool_size: AtomicUsize, + proxy_config: &'static crate::config::ProxyConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { // Maximum number of connections per one endpoint. // Can mix different (dbname, username) connections. // When running out of free slots for a particular endpoint, // falls back to opening a new connection for each request. - max_conns_per_endpoint: usize, + pub max_conns_per_endpoint: usize, - proxy_config: &'static crate::config::ProxyConfig, + pub gc_epoch: Duration, - // Using a lock to remove any race conditions. - // Eg cleaning up connections while a new connection is returned - closed: RwLock, + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, } +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + "proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + impl GlobalConnPool { pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + let shards = config.http_config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::new(), + global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, proxy_config: config, - closed: RwLock::new(false), }) } pub fn shutdown(&self) { - *self.closed.write() = true; + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } - self.global_pool.retain(|_, endpoint_pool| { - let mut pool = endpoint_pool.write(); - // by clearing this hashmap, we remove the slots that a connection can be returned to. - // when returning, it drops the connection if the slot doesn't exist - pool.pools.clear(); - pool.total_conns = 0; + pub async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; - false + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = GC_LATENCY.start_timer(); + let current_len = shard.len(); + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. 
+ } = pool.get_mut(); + + // ensure that closed clients are removed + pools + .iter_mut() + .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true }); + let new_len = shard.len(); + drop(shard); + timer.observe_duration(); + + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. size now {global_pool_size}"); + } } pub async fn get( self: &Arc, - conn_info: &ConnInfo, + conn_info: ConnInfo, force_new: bool, session_id: uuid::Uuid, peer_addr: IpAddr, @@ -147,15 +317,11 @@ impl GlobalConnPool { let mut client: Option = None; let mut latency_timer = LatencyTimer::new("http"); - let pool = if force_new { - None - } else { - Some((conn_info.clone(), self.clone())) - }; - let mut hash_valid = false; + let mut endpoint_pool = Weak::new(); if !force_new { let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); + endpoint_pool = Arc::downgrade(&pool); let mut hash = None; // find a pool entry by (dbname, username) if exists @@ -180,12 +346,8 @@ impl GlobalConnPool { // we will continue with the regular connection flow if validate.is_ok() { hash_valid = true; - let mut pool = pool.write(); - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - if let Some(entry) = pool_entries.conns.pop() { - client = Some(entry.conn); - pool.total_conns -= 1; - } + if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { + client = Some(entry.conn) } } } @@ -198,11 +360,12 @@ impl GlobalConnPool { info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await } else { @@ -214,18 +377,19 @@ impl GlobalConnPool { ); latency_timer.pool_hit(); latency_timer.success(); - return Ok(Client::new(client, pool).await); + return Ok(Client::new(client, conn_info, endpoint_pool).await); } } else { let conn_id = uuid::Uuid::new_v4(); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await }; @@ -269,59 +433,7 @@ impl GlobalConnPool { _ => {} } let new_client = new_client?; - Ok(Client::new(new_client, pool).await) - } - - fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { - let conn_id = client.conn_id; - - // We want to hold this open while we return. This ensures that the pool can't close - // while we are in the middle of returning the connection. 
- let closed = self.closed.read(); - if *closed { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed"); - return Ok(()); - } - - if client.inner.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); - } - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < self.max_conns_per_endpoint { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - } - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - - Ok(()) + Ok(Client::new(new_client, conn_info, endpoint_pool).await) } fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { @@ -334,6 +446,12 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, + max_conns: self + .proxy_config + .http_config + .pool_options + .max_conns_per_endpoint, + _guard: ENDPOINT_POOLS.guard(), })); // find or create a pool for this endpoint @@ -363,9 +481,11 @@ impl GlobalConnPool { } struct TokioMechanism<'a> { + pool: Weak>, conn_info: &'a ConnInfo, session_id: uuid::Uuid, conn_id: uuid::Uuid, + idle: Duration, } #[async_trait] @@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> { timeout, self.conn_id, self.session_id, + self.pool.clone(), + self.idle, ) .await } @@ -403,6 +525,7 @@ async fn connect_to_compute( session_id: uuid::Uuid, latency_timer: LatencyTimer, peer_addr: IpAddr, + pool: Weak>, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -447,6 +570,8 @@ async fn connect_to_compute( conn_id, conn_info, session_id, + pool, + idle: config.http_config.pool_options.idle_timeout, }, node_info, &extra, @@ -462,6 +587,8 @@ async fn connect_to_compute_once( timeout: time::Duration, conn_id: uuid::Uuid, mut session: uuid::Uuid, + pool: Weak>, + idle: Duration, ) -> Result { let mut config = (*node_info.config).clone(); @@ -490,13 +617,29 @@ async fn connect_to_compute_once( branch_id: node_info.aux.branch_id.clone(), }; + let db_user = conn_info.db_and_user(); tokio::spawn( async move { let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); info!(%session, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); + if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. 
+ // does nothing if the client is currently checked-out and in-use + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("idle connection removed"); + } + } } loop { @@ -514,15 +657,25 @@ async fn connect_to_compute_once( } Some(Err(e)) => { error!(%session, "connection error: {}", e); - return Poll::Ready(()) + break } None => { info!("connection closed"); - return Poll::Ready(()) + break } } } - }).await + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + } .instrument(span) ); @@ -552,23 +705,27 @@ pub struct Client { conn_id: uuid::Uuid, span: Span, inner: Option, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, } pub struct Discard<'a> { conn_id: uuid::Uuid, - pool: &'a mut Option<(ConnInfo, Arc)>, + conn_info: &'a ConnInfo, + pool: &'a mut Weak>, } impl Client { pub(self) async fn new( inner: ClientInner, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, ) -> Self { Self { conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), + conn_info, pool, } } @@ -577,6 +734,7 @@ impl Client { inner, pool, conn_id, + conn_info, span: _, } = self; ( @@ -586,6 +744,7 @@ impl Client { .inner, Discard { pool, + conn_info, conn_id: *conn_id, }, ) @@ -601,14 +760,14 @@ impl Client { impl Discard<'_> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - if status != ReadyForQueryStatus::Idle { - if let Some((conn_info, _)) = self.pool.take() { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") - } + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { - if let Some((conn_info, _)) = self.pool.take() { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } @@ -628,16 +787,17 @@ impl Deref for Client { impl Drop for Client { fn drop(&mut self) { + let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); - if let Some((conn_info, conn_pool)) = self.pool.take() { + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool tokio::task::spawn_blocking(move || { let _span = current_span.enter(); - let _ = conn_pool.put(&conn_info, client); + let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 307b085ce0..2e9d8526d3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -206,7 +206,7 @@ pub async fn handle( config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.timeout, + config.request_timeout, handle_inner( config, request, @@ -278,7 +278,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.timeout.as_secs() + config.request_timeout.as_secs() ); error!(message); json_response( @@ -320,7 +320,8 @@ async fn 
handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = + !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -359,7 +360,7 @@ async fn handle_inner( let payload: Payload = serde_json::from_slice(&body)?; let mut client = conn_pool - .get(&conn_info, !allow_pool, session_id, peer_addr) + .get(conn_info, !allow_pool, session_id, peer_addr) .await?; let mut response = Response::builder() From 5385791ca6e75167b1f8789d0d995332a4c9f512 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Dec 2023 13:07:23 +0100 Subject: [PATCH 34/57] add pageserver component-level benchmark (`pagebench`) (#6174) This PR adds a component-level benchmarking utility for pageserver. Its name is `pagebench`. The problem solved by `pagebench` is that we want to put Pageserver under high load. This isn't easily achieved with `pgbench` because it needs to go through a compute, which has signficant performance overhead compared to accessing Pageserver directly. Further, compute has its own performance optimizations (most importantly: caches). Instead of designing a compute-facing workload that defeats those internal optimizations, `pagebench` simply bypasses them by accessing pageserver directly. Supported benchmarks: * getpage@latest_lsn * basebackup * triggering logical size calculation This code has no automated users yet. A performance regression test for getpage@latest_lsn will be added in a later PR. part of https://github.com/neondatabase/neon/issues/5771 --- Cargo.lock | 36 ++ Cargo.toml | 2 + libs/pageserver_api/src/shard.rs | 4 + libs/utils/src/lsn.rs | 43 +++ pageserver/client/src/mgmt_api.rs | 2 + pageserver/client/src/mgmt_api/util.rs | 49 +++ pageserver/pagebench/Cargo.toml | 26 ++ pageserver/pagebench/src/cmd/basebackup.rs | 272 ++++++++++++++ .../pagebench/src/cmd/getpage_latest_lsn.rs | 335 ++++++++++++++++++ .../cmd/trigger_initial_size_calculation.rs | 85 +++++ pageserver/pagebench/src/main.rs | 48 +++ pageserver/pagebench/src/util/cli/targets.rs | 34 ++ pageserver/pagebench/src/util/connstring.rs | 8 + .../pagebench/src/util/request_stats.rs | 88 +++++ .../src/util/tokio_thread_local_stats.rs | 45 +++ pageserver/src/pgdatadir_mapping.rs | 2 +- 16 files changed, 1078 insertions(+), 1 deletion(-) create mode 100644 pageserver/client/src/mgmt_api/util.rs create mode 100644 pageserver/pagebench/Cargo.toml create mode 100644 pageserver/pagebench/src/cmd/basebackup.rs create mode 100644 pageserver/pagebench/src/cmd/getpage_latest_lsn.rs create mode 100644 pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs create mode 100644 pageserver/pagebench/src/main.rs create mode 100644 pageserver/pagebench/src/util/cli/targets.rs create mode 100644 pageserver/pagebench/src/util/connstring.rs create mode 100644 pageserver/pagebench/src/util/request_stats.rs create mode 100644 pageserver/pagebench/src/util/tokio_thread_local_stats.rs diff --git a/Cargo.lock b/Cargo.lock index 0e51e88e3b..0be6d5d183 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2106,6 +2106,20 @@ dependencies = [ "hashbrown 0.13.2", ] +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 
0.21.1", + "byteorder", + "crossbeam-channel", + "flate2", + "nom", + "num-traits", +] + [[package]] name = "heapless" version = "0.8.0" @@ -3057,6 +3071,28 @@ dependencies = [ "sha2", ] +[[package]] +name = "pagebench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "hdrhistogram", + "humantime", + "humantime-serde", + "pageserver", + "pageserver_api", + "pageserver_client", + "rand 0.8.5", + "serde", + "serde_json", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pagectl" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 6884de7bf5..5de636778a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "pageserver", "pageserver/ctl", "pageserver/client", + "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", @@ -79,6 +80,7 @@ futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" +hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3668f7939d..3e4936eec4 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -81,6 +81,10 @@ impl TenantShardId { pub fn is_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } } /// Formatting helper diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 262dcb8a8a..b3269ae049 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -366,6 +366,49 @@ impl MonotonicCounter for RecordLsn { } } +/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// +/// This is used by the `pagebench` pageserver benchmarking tool. +pub struct LsnSampler(::Sampler); + +impl rand::distributions::uniform::SampleUniform for Lsn { + type Sampler = LsnSampler; +} + +impl rand::distributions::uniform::UniformSampler for LsnSampler { + type X = Lsn; + + fn new(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn new_inclusive(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn sample(&self, rng: &mut R) -> Self::X { + Lsn(self.0.sample(rng)) + } +} + #[cfg(test)] mod tests { use crate::bin_ser::BeSer; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 0ad4e1551e..87e4ed8efd 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -5,6 +5,8 @@ use utils::{ id::{TenantId, TimelineId}, }; +pub mod util; + #[derive(Debug)] pub struct Client { mgmt_api_endpoint: String, diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs new file mode 100644 index 0000000000..048a3bb7cd --- /dev/null +++ b/pageserver/client/src/mgmt_api/util.rs @@ -0,0 +1,49 @@ +//! Helpers to do common higher-level tasks with the [`Client`]. + +use std::sync::Arc; + +use tokio::task::JoinSet; +use utils::id::{TenantId, TenantTimelineId}; + +use super::Client; + +/// Retrieve a list of all of the pageserver's timelines. +/// +/// Fails if there are sharded tenants present on the pageserver. 
+pub async fn get_pageserver_tenant_timelines_unsharded( + api_client: &Arc, +) -> anyhow::Result> { + let mut timelines: Vec = Vec::new(); + let mut tenants: Vec = Vec::new(); + for ti in api_client.list_tenants().await? { + if !ti.id.is_unsharded() { + anyhow::bail!( + "only unsharded tenants are supported at this time: {}", + ti.id + ); + } + tenants.push(ti.id.tenant_id) + } + let mut js = JoinSet::new(); + for tenant_id in tenants { + js.spawn({ + let mgmt_api_client = Arc::clone(api_client); + async move { + ( + tenant_id, + mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + ) + } + }); + } + while let Some(res) = js.join_next().await { + let (tenant_id, details) = res.unwrap(); + for timeline_id in details.timelines { + timelines.push(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + } + Ok(timelines) +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml new file mode 100644 index 0000000000..169d9b7f8e --- /dev/null +++ b/pageserver/pagebench/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pagebench" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures.workspace = true +hdrhistogram.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +rand.workspace = true +serde.workspace = true +serde_json.workspace = true +tracing.workspace = true +tokio.workspace = true + +pageserver = { path = ".." } +pageserver_client.workspace = true +pageserver_api.workspace = true +utils = { path = "../../libs/utils/" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs new file mode 100644 index 0000000000..85a3e695de --- /dev/null +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -0,0 +1,272 @@ +use anyhow::Context; +use pageserver_client::page_service::BasebackupRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{debug, info, instrument}; + +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// basebackup@LatestLSN +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long, default_value = "1.0")] + gzip_probability: f64, + #[clap(long)] + runtime: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +struct Target { + timeline: TenantTimelineId, + lsn_range: Option>, +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: 
Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let timeline = *timeline; + // FIXME: this triggers initial logical size calculation + // https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + async move { + anyhow::Ok(Target { + timeline, + // TODO: support lsn_range != latest LSN + lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + }) + } + }); + } + let mut all_targets: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_targets.push(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender = async move { + start_work_barrier.wait().await; + loop { + let (timeline, work) = { + let mut rng = rand::thread_rng(); + let target = all_targets.choose(&mut rng).unwrap(); + let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + ( + target.timeline, + Work { + lsn, + gzip: rng.gen_bool(args.gzip_probability), + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? 
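+            // (With the capacity-1 channel created above, this send waits whenever the client
+            // task has not yet taken the previous work item, so a slow client stalls this single
+            // sender loop and, with it, work distribution to every timeline.)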
+ sender.send(work).await.ok().unwrap(); + } + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[derive(Copy, Clone)] +struct Work { + lsn: Option, + gzip: bool, +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( + &args.page_service_host_port, + args.pageserver_jwt.as_deref(), + )) + .await + .unwrap(); + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + let copy_out_stream = client + .basebackup(&BasebackupRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + lsn, + gzip, + }) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap(); + + use futures::StreamExt; + let size = Arc::new(AtomicUsize::new(0)); + copy_out_stream + .for_each({ + |r| { + let size = Arc::clone(&size); + async move { + let size = Arc::clone(&size); + size.fetch_add(r.unwrap().len(), Ordering::Relaxed); + } + } + }) + .await; + debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs new file mode 100644 index 0000000000..16d198ab0e --- /dev/null +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -0,0 +1,335 @@ +use anyhow::Context; +use futures::future::join_all; +use pageserver::pgdatadir_mapping::key_to_rel_block; +use pageserver::repository; +use pageserver_api::key::is_rel_block_key; +use pageserver_client::page_service::RelTagBlockNo; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; + +use std::collections::HashMap; +use std::future::Future; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. 
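+///
+/// Example invocation (a sketch; the subcommand name assumes clap's default kebab-case
+/// mapping of the `GetPageLatestLsn` variant in `main.rs`, and every flag either has a
+/// default or is optional):
+/// `pagebench get-page-latest-lsn --num-clients 8 --runtime 60s`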
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long)] + runtime: Option, + #[clap(long)] + per_target_rate_limit: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +#[derive(Clone)] +struct KeyRange { + timeline: TenantTimelineId, + timeline_lsn: Lsn, + start: i128, + end: i128, +} + +impl KeyRange { + fn len(&self) -> i128 { + self.end - self.start + } +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace(timeline.tenant_id, timeline.timeline_id) + .await?; + let lsn = partitioning.at_lsn; + + let ranges = partitioning + .keys + .ranges + .iter() + .filter_map(|r| { + let start = r.start; + let end = r.end; + // filter out non-relblock keys + match (is_rel_block_key(&start), is_rel_block_key(&end)) { + (true, true) => Some(KeyRange { + timeline, + timeline_lsn: lsn, + start: start.to_i128(), + end: end.to_i128(), + }), + (true, false) | (false, true) => { + unimplemented!("split up range") + } + (false, false) => None, + } + }) + .collect::>(); + + anyhow::Ok(ranges) + } + }); + } + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_ranges.extend(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + 
completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender: Pin>> = match args.per_target_rate_limit { + None => Box::pin(async move { + let weights = rand::distributions::weighted::WeightedIndex::new( + all_ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + start_work_barrier.wait().await; + + loop { + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &all_ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + let sender = work_senders.get(&range.timeline).unwrap(); + // TODO: what if this blocks? + sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }), + Some(rps_limit) => Box::pin(async move { + let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); + + let make_timeline_task: &dyn Fn( + TenantTimelineId, + ) + -> Pin>> = &|timeline| { + let sender = work_senders.get(&timeline).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + loop { + ticker.tick().await; + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }) + }; + + let tasks: Vec<_> = work_senders + .keys() + .map(|tl| make_timeline_task(**tl)) + .collect(); + + start_work_barrier.wait().await; + + join_all(tasks).await; + }), + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + all_work_done_barrier: Arc, + live_stats: Arc, 
+) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + + while let Some((key, lsn)) = work.recv().await { + let start = Instant::now(); + client + .getpage(key, lsn) + .await + .with_context(|| format!("getpage for {timeline}")) + .unwrap(); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs new file mode 100644 index 0000000000..d46ae94e8a --- /dev/null +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use humantime::Duration; +use tokio::task::JoinSet; +use utils::id::TenantTimelineId; + +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap( + long, + help = "if specified, poll mgmt api to check whether init logical size calculation has completed" + )] + poll_for_completion: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + // kick it off + + let mut js = JoinSet::new(); + for tl in timelines { + let mgmt_api_client = Arc::clone(&mgmt_api_client); + js.spawn(async move { + // TODO: API to explicitly trigger initial logical size computation. + // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation. 
+ // => https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + + if let Some(period) = args.poll_for_completion { + let mut ticker = tokio::time::interval(period.into()); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let mut info = info; + while !info.current_logical_size_is_accurate { + ticker.tick().await; + info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + } + } + }); + } + while let Some(res) = js.join_next().await { + let _: () = res.unwrap(); + } + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs new file mode 100644 index 0000000000..e0120c9212 --- /dev/null +++ b/pageserver/pagebench/src/main.rs @@ -0,0 +1,48 @@ +use clap::Parser; +use utils::logging; + +/// Re-usable pieces of code that aren't CLI-specific. +mod util { + pub(crate) mod connstring; + pub(crate) mod request_stats; + #[macro_use] + pub(crate) mod tokio_thread_local_stats; + /// Re-usable pieces of CLI-specific code. + pub(crate) mod cli { + pub(crate) mod targets; + } +} + +/// The pagebench CLI sub-commands, dispatched in [`main`] below. +mod cmd { + pub(super) mod basebackup; + pub(super) mod getpage_latest_lsn; + pub(super) mod trigger_initial_size_calculation; +} + +/// Component-level performance test for pageserver. +#[derive(clap::Parser)] +enum Args { + Basebackup(cmd::basebackup::Args), + GetPageLatestLsn(cmd::getpage_latest_lsn::Args), + TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), +} + +fn main() { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stderr, + ) + .unwrap(); + + let args = Args::parse(); + match args { + Args::Basebackup(args) => cmd::basebackup::main(args), + Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Args::TriggerInitialSizeCalculation(args) => { + cmd::trigger_initial_size_calculation::main(args) + } + } + .unwrap() +} diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs new file mode 100644 index 0000000000..848eae27cf --- /dev/null +++ b/pageserver/pagebench/src/util/cli/targets.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use pageserver_client::mgmt_api; +use tracing::info; +use utils::id::TenantTimelineId; + +pub(crate) struct Spec { + pub(crate) limit_to_first_n_targets: Option, + pub(crate) targets: Option>, +} + +pub(crate) async fn discover( + api_client: &Arc, + spec: Spec, +) -> anyhow::Result> { + let mut timelines = if let Some(targets) = spec.targets { + targets + } else { + mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await? 
+ }; + + if let Some(limit) = spec.limit_to_first_n_targets { + timelines.sort(); // for determinism + timelines.truncate(limit); + if timelines.len() < limit { + anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants"); + } + } + + info!("timelines:\n{:?}", timelines); + info!("number of timelines:\n{:?}", timelines.len()); + + Ok(timelines) +} diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs new file mode 100644 index 0000000000..07a0ff042d --- /dev/null +++ b/pageserver/pagebench/src/util/connstring.rs @@ -0,0 +1,8 @@ +pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { + let colon_and_jwt = if let Some(jwt) = jwt { + format!(":{jwt}") // TODO: urlescape + } else { + String::new() + }; + format!("postgres://postgres{colon_and_jwt}@{host_port}") +} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs new file mode 100644 index 0000000000..5ecf1cbf24 --- /dev/null +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -0,0 +1,88 @@ +use std::time::Duration; + +use anyhow::Context; + +pub(crate) struct Stats { + latency_histo: hdrhistogram::Histogram, +} + +impl Stats { + pub(crate) fn new() -> Self { + Self { + // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram, + // which would skew the benchmark results. + latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(), + } + } + pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> { + let micros: u64 = latency + .as_micros() + .try_into() + .context("latency greater than u64")?; + self.latency_histo + .record(micros) + .context("add to histogram")?; + Ok(()) + } + pub(crate) fn output(&self) -> Output { + let latency_percentiles = std::array::from_fn(|idx| { + let micros = self + .latency_histo + .value_at_percentile(LATENCY_PERCENTILES[idx]); + Duration::from_micros(micros) + }); + Output { + request_count: self.latency_histo.len(), + latency_mean: Duration::from_micros(self.latency_histo.mean() as u64), + latency_percentiles: LatencyPercentiles { + latency_percentiles, + }, + } + } + pub(crate) fn add(&mut self, other: &Self) { + let Self { + ref mut latency_histo, + } = self; + latency_histo.add(&other.latency_histo).unwrap(); + } +} + +impl Default for Stats { + fn default() -> Self { + Self::new() + } +} + +const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99]; + +struct LatencyPercentiles { + latency_percentiles: [Duration; 4], +} + +impl serde::Serialize for LatencyPercentiles { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; + for p in LATENCY_PERCENTILES { + ser.serialize_entry( + &format!("p{p}"), + &format!( + "{}", + &humantime::format_duration(self.latency_percentiles[0]) + ), + )?; + } + ser.end() + } +} + +#[derive(serde::Serialize)] +pub(crate) struct Output { + request_count: u64, + #[serde(with = "humantime_serde")] + latency_mean: Duration, + latency_percentiles: LatencyPercentiles, +} diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs new file mode 100644 index 0000000000..82526213b6 --- /dev/null +++ b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs @@ -0,0 +1,45 @@ +pub(crate) type ThreadLocalStats = Arc>; +pub(crate) type 
AllThreadLocalStats = Arc>>>; + +macro_rules! declare { + ($THREAD_LOCAL_NAME:ident: $T:ty) => { + thread_local! { + pub static $THREAD_LOCAL_NAME: std::cell::RefCell> = std::cell::RefCell::new( + std::sync::Arc::new(std::sync::Mutex::new(Default::default())) + ); + } + }; +} + +use std::sync::{Arc, Mutex}; + +pub(crate) use declare; + +macro_rules! main { + ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{ + let main_impl = $main_impl; + let all = Arc::new(Mutex::new(Vec::new())); + + let rt = tokio::runtime::Builder::new_multi_thread() + .on_thread_start({ + let all = Arc::clone(&all); + move || { + // pre-initialize the thread local stats by accessesing them + // (some stats like requests_stats::Stats are quite costly to initialize, + // we don't want to pay that cost during the measurement period) + $THREAD_LOCAL_NAME.with(|stats| { + let stats: Arc<_> = Arc::clone(&*stats.borrow()); + all.lock().unwrap().push(stats); + }); + } + }) + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(all)); + rt.block_on(main_task).unwrap() + }}; +} + +pub(crate) use main; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b81037ae47..e9884a15f5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1776,6 +1776,7 @@ pub fn is_inherited_key(key: Key) -> bool { key != AUX_FILES_KEY } +/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( @@ -1790,7 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } - pub fn is_rel_fsm_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff } From f93d15f78124b25e70fb2f61a837c878965a66b6 Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Thu, 21 Dec 2023 13:34:31 +0100 Subject: [PATCH 35/57] add comment to run vacuum for clickbench (#6212) ## Problem This is a comment only change. To ensure that our benchmarking results are fair we need to have correct stats in catalog. Otherwise optimizer chooses seq scan instead of index only scan for some queries. Added comment to run vacuum after data prep. --- test_runner/performance/test_perf_olap.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 1de7e95bbe..1e6e9a0174 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -151,7 +151,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: An OLAP-style ClickHouse benchmark Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql - The DB prepared manually in advance + The DB prepared manually in advance. + Important: after intial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;` + to ensure that Postgres optimizer chooses the same plans as RDS and Aurora. 
""" explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true" From 61b6c4cf3037c17700d12303724fc25de5bbc24c Mon Sep 17 00:00:00 2001 From: Abhijeet Patil Date: Thu, 21 Dec 2023 12:46:51 +0000 Subject: [PATCH 36/57] Build dockerfile from neon repo (#6195) ## Fixing GitHub workflow issue related to build and push images ## Summary of changes Followup of PR#608[move docker file from build repo to neon to solve issue some issues The build started failing because it missed a validation in logic that determines changes in the docker file Also, all the dependent jobs were skipped because of the build and push of the image job. To address the above issue following changes were made - we are adding validation to generate image tag even if it's a merge to repo. - All the dependent jobs won't skip even if the build and push image job is skipped. - We have moved the logic to generate a tag in the sub-workflow. As the tag name was necessary to be passed to the sub-workflow it made sense to abstract that away where it was needed and then store it as an output variable so that downward dependent jobs could access the value. - This made the dependency logic easy and we don't need complex expressions to check the condition on which it will run - An earlier PR was closed that tried solving a similar problem that has some feedback and context before creating this PR https://github.com/neondatabase/neon/pull/6175 ## Checklist before requesting a review - [x] Move the tag generation logic from the main workflow to the sub-workflow of build and push the image - [x] Add a condition to generate an image tag for a non-PR-related run - [x] remove complex if the condition from the job if conditions --------- Co-authored-by: Alexander Bayandin Co-authored-by: Abhijeet Patil --- .../workflows/build_and_push_docker_image.yml | 102 +++++++++++ .github/workflows/build_and_test.yml | 47 +++-- .../workflows/update_build_tools_image.yml | 130 ++++++++++++++ .gitignore | 1 + CONTRIBUTING.md | 14 ++ Dockerfile | 2 +- Dockerfile.buildtools | 165 ++++++++++++++++++ Dockerfile.compute-node | 2 +- Dockerfile.compute-tools | 2 +- 9 files changed, 443 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/build_and_push_docker_image.yml create mode 100644 .github/workflows/update_build_tools_image.yml create mode 100644 Dockerfile.buildtools diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..2bdf4a2066 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,102 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + outputs: + build-tools-tag: + description: "tag generated for build tools" + value: ${{ jobs.tag.outputs.build-tools-tag }} + +jobs: + tag: + runs-on: ubuntu-latest + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + check-if-build-tools-dockerfile-changed: + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + if [[ 
"$GITHUB_EVENT_NAME" != "pull_request" ]]; then + echo "docker_file_changed=false" >> $GITHUB_OUTPUT + exit + fi + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + kaniko: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + + kaniko-arm: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + manifest: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - tag + - kaniko + - kaniko-arm + - check-if-build-tools-dockerfile-changed + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6cb6d9df02..77f75b7b82 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,7 +44,6 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -74,11 +73,19 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + build-buildtools-image: needs: [ check-permissions ] + uses: ./.github/workflows/build_and_push_docker_image.yml + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + secrets: inherit + + check-codestyle-python: + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -108,10 +115,10 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions ] + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +182,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -408,10 +415,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, tag ] + needs: [ check-permissions, build-neon, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -447,10 +454,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -479,12 +486,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -526,11 +533,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -694,7 +700,7 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: 
gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -733,6 +739,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -743,7 +750,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -778,6 +785,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -788,7 +796,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -836,6 +844,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow it used to update tag of build tools in ECR. +# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
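+# Example (see CONTRIBUTING.md): gh workflow -R neondatabase/neon run update_build_tools_image.yml -f from-tag=<github-run-id> -f to-tag=pinned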
+ +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest 
"${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index c5fc121ac2..3f4495c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to an buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. + +You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned \ + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. +``` \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..d3d05b4e20 --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,165 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 22.2 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.1.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. 
\ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.74.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . 
"$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a23e930c48..8db60ff85f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG From 7d6fc3c826827d8bf7dea789e366c43a483884d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 21 Dec 2023 15:23:09 +0100 Subject: [PATCH 37/57] Use pre-generated initdb.tar.zst in test_ingest_real_wal (#6206) This implements the TODO mentioned in the test added by #5892. --- pageserver/src/tenant.rs | 1 + pageserver/src/walingest.rs | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d6f1001db..2f2169d194 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3134,6 +3134,7 @@ impl Tenant { /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] + #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 16b245c488..1d14214030 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1612,6 +1612,7 @@ impl<'a> WalIngest<'a> { mod tests { use super::*; use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -2177,21 +2178,25 @@ mod tests { let pg_version = 15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let (tenant, ctx) = harness.load().await; + + let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); + + std::fs::create_dir_all(initdb_path.parent().unwrap()) + .expect("creating test dir should work"); + 
std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works"); + // Bootstrap a real timeline. We can't use create_test_timeline because // it doesn't create a real checkpoint, and Walingest::new tries to parse // the garbage data. - // - // TODO use the initdb.tar.zst file stored with the test data to avoid - // problems with inconsistent initdb results after pg minor version bumps. - let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") - .unwrap() - .load() - .await; let tline = tenant - .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx) .await .unwrap(); From 1dff98be84fb9aa2497ebf0a36b94143ceb4d729 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 21 Dec 2023 14:55:24 +0000 Subject: [PATCH 38/57] CI: fix build-tools image tag for PRs (#6217) ## Problem Fix build-tools image tag calculation for PRs. Broken in https://github.com/neondatabase/neon/pull/6195 ## Summary of changes - Use `pinned` tag instead of `$GITHUB_RUN_ID` if there's no changes in the dockerfile (and we don't build such image) --- .../workflows/build_and_push_docker_image.yml | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml index 2bdf4a2066..e401b2f418 100644 --- a/.github/workflows/build_and_push_docker_image.yml +++ b/.github/workflows/build_and_push_docker_image.yml @@ -15,24 +15,6 @@ on: value: ${{ jobs.tag.outputs.build-tools-tag }} jobs: - tag: - runs-on: ubuntu-latest - outputs: - build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} - - steps: - - name: Get buildtools tag - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - check-if-build-tools-dockerfile-changed: runs-on: ubuntu-latest outputs: @@ -51,7 +33,28 @@ jobs: fi env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - + + tag: + runs-on: ubuntu-latest + needs: [ check-if-build-tools-dockerfile-changed ] + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + env: + DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + kaniko: if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' needs: [ tag, check-if-build-tools-dockerfile-changed ] From a21b71977001b7410d68bb1cc2dfa0352061614b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 21 Dec 2023 17:28:28 +0100 Subject: [PATCH 39/57] Use neon-github-ci-tests S3 bucket for remote_storage tests (#6216) This bucket is already used by the pytests. The current bucket github-public-dev is more meant for longer living artifacts. 
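For reference, a minimal sketch of how the real-S3 remote_storage tests are driven after this change (it mirrors the CI steps in the diff below and assumes AWS credentials are already present in the environment):

```bash
# Point the real-S3 remote_storage tests at the CI bucket and run them
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
cargo test --package remote_storage --test test_real_s3
```
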
slack thread: https://neondb.slack.com/archives/C039YKBRZB4/p1703124944669009 Part of https://github.com/neondatabase/cloud/issues/8233 / #6155 --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 77f75b7b82..3091ce6d3a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -345,7 +345,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index b1ea5e4f74..c6c2b7386a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -218,7 +218,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 From 83000b3824dda8a89e29fea7885a15fbb3f00d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 21 Dec 2023 18:07:21 +0100 Subject: [PATCH 40/57] buildtools: update protoc and mold (#6222) These updates aren't very important but I would like to try out the new process as of #6195 --- Dockerfile.buildtools | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools index d3d05b4e20..77722f173b 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.buildtools @@ -51,7 +51,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 22.2 +ENV PROTOC_VERSION 25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ @@ -81,7 +81,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.1.0 +ENV MOLD_VERSION v2.4.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ From e68ae2888a6baf4efbe683ac889d4deed7fa5f20 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 22 Dec 2023 10:22:22 +0000 Subject: [PATCH 41/57] pageserver: expedite tenant activation on delete (#6190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem During startup, a tenant delete request might have to retry for many minutes waiting for a tenant to enter Active state. ## Summary of changes - Refactor delete_tenant into TenantManager: this is not a functional change, but will avoid merge conflicts with https://github.com/neondatabase/neon/pull/6105 later - Add 412 responses to the swagger definition of this endpoint. 
- Use Tenant::wait_to_become_active in `TenantManager::delete_tenant` --------- Co-authored-by: Arpad Müller --- pageserver/src/http/openapi_spec.yml | 6 ++ pageserver/src/http/routes.rs | 5 +- pageserver/src/tenant/delete.rs | 3 + pageserver/src/tenant/mgr.rs | 100 ++++++++++++++-------- test_runner/regress/test_timeline_size.py | 51 +++++++++++ 5 files changed, 129 insertions(+), 36 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b79c5ada9a..1fbca1086f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -159,6 +159,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" + "412": + description: Deletion may not proceed, tenant is not in Active state + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3ea79ea4f2..11a3a2c872 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -308,6 +308,7 @@ impl From for ApiError { SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + Cancelled => ApiError::ShuttingDown, } } } @@ -886,7 +887,9 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id) + state + .tenant_manager + .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug() diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index e8491f26db..b21bad51ba 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError { #[error("Timeline {0}")] Timeline(#[from] DeleteTimelineError), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 31d80026f0..62922e8c99 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1091,6 +1091,71 @@ impl TenantManager { .collect(), } } + + pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + activation_timeout: Duration, + ) -> Result<(), DeleteTenantError> { + // We acquire a SlotGuard during this function to protect against concurrent + // changes while the ::prepare phase of DeleteTenantFlow executes, but then + // have to return the Tenant to the map while the background deletion runs. + // + // TODO: refactor deletion to happen outside the lifetime of a Tenant. + // Currently, deletion requires a reference to the tenants map in order to + // keep the Tenant in the map until deletion is complete, and then remove + // it at the end. 
+ // + // See https://github.com/neondatabase/neon/issues/5080 + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // unwrap is safe because we used MustExist mode when acquiring + let tenant = match slot_guard.get_old_value().as_ref().unwrap() { + TenantSlot::Attached(tenant) => tenant.clone(), + _ => { + // Express "not attached" as equivalent to "not found" + return Err(DeleteTenantError::NotAttached); + } + }; + + match tenant.current_state() { + TenantState::Broken { .. } | TenantState::Stopping { .. } => { + // If a tenant is broken or stopping, DeleteTenantFlow can + // handle it: broken tenants proceed to delete, stopping tenants + // are checked for deletion already in progress. + } + _ => { + tenant + .wait_to_become_active(activation_timeout) + .await + .map_err(|e| match e { + GetActiveTenantError::WillNotBecomeActive(_) => { + DeleteTenantError::InvalidState(tenant.current_state()) + } + GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, + GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, + GetActiveTenantError::WaitForActiveTimeout { + latest_state: _latest_state, + wait_time: _wait_time, + } => DeleteTenantError::InvalidState(tenant.current_state()), + })?; + } + } + + let result = DeleteTenantFlow::run( + self.conf, + self.resources.remote_storage.clone(), + &TENANTS, + tenant, + ) + .await; + + // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow + slot_guard.revert(); + result + } } #[derive(Debug, thiserror::Error)] @@ -1268,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout( Ok(tenant) } -pub(crate) async fn delete_tenant( - conf: &'static PageServerConf, - remote_storage: Option, - tenant_shard_id: TenantShardId, -) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. 
- // - // See https://github.com/neondatabase/neon/issues/5080 - - // TODO(sharding): make delete API sharding-aware - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6e510b2eba..11685d1d48 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,3 +1,4 @@ +import concurrent.futures import math import queue import random @@ -24,6 +25,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, + wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def get_tenant_states(): states = {} + log.info(f"Tenant ids: {tenant_ids}") for tenant_id in tenant_ids: tenant = pageserver_http.tenant_status(tenant_id=tenant_id) states[tenant_id] = tenant["state"]["slug"] @@ -872,3 +875,51 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + + # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # body of the test because it will disrupt tenant counts + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + wait_until(10, 1, at_least_one_active) + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + # Deleting a stuck tenant should prompt it to go active + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def delete_tenant(): + env.pageserver.http_client().tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. 
So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + + def activated_on_demand(): + assert env.pageserver.log_contains(log_match) is not None + + log.info(f"Waiting for activation message '{log_match}'") + try: + wait_until(10, 1, activated_on_demand) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked + log.info("Joining background delete") + background_delete.result(timeout=10) + + # Poll for deletion to complete + wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 1 From a7342b3897e491de977e5af25bc8c772a5af05b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 22 Dec 2023 14:13:20 +0100 Subject: [PATCH 42/57] remote_storage: store last_modified and etag in Download (#6227) Store the content of the `last-modified` and `etag` HTTP headers in `Download`. This serves both as the first step towards #6199 and as a preparation for tests in #6155 . --- libs/remote_storage/src/azure_blob.rs | 11 +++++++++++ libs/remote_storage/src/lib.rs | 11 +++++++++-- libs/remote_storage/src/local_fs.rs | 26 ++++++++++++++------------ libs/remote_storage/src/s3_bucket.rs | 4 ++++ 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 548bde02f6..7ea1103eb2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -117,6 +117,8 @@ impl AzureBlobStorage { ) -> Result { let mut response = builder.into_stream(); + let mut etag = None; + let mut last_modified = None; let mut metadata = HashMap::new(); // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 @@ -124,6 +126,13 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part.map_err(to_download_error)?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } @@ -136,6 +145,8 @@ impl AzureBlobStorage { } Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, metadata: Some(StorageMetadata(metadata)), }) } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e77c54e1e7..3e408e3119 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,7 +14,9 @@ mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; +use std::{ + collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, +}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -207,8 +209,13 @@ pub trait 
RemoteStorage: Send + Sync + 'static { async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } +pub type DownloadStream = Pin> + Unpin + Send + Sync>>; pub struct Download { - pub download_stream: Pin> + Unpin + Send + Sync>>, + pub download_stream: DownloadStream, + /// The last time the file was modified (`last-modified` HTTP header) + pub last_modified: Option, + /// A way to identify this specific version of the resource (`etag` HTTP header) + pub etag: Option, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 03b98e5ea2..d1e7d325b9 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; +use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs { .map_err(DownloadError::Other)?; Ok(Download { metadata, + last_modified: None, + etag: None, download_stream: Box::pin(source), }) } else { @@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs { .await .map_err(DownloadError::Other)?; - Ok(match end_exclusive { - Some(end_exclusive) => Download { - metadata, - download_stream: Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - }, - None => Download { - metadata, - download_stream: Box::pin(ReaderStream::new(source)), - }, + let download_stream: DownloadStream = match end_exclusive { + Some(end_exclusive) => Box::pin(ReaderStream::new( + source.take(end_exclusive - start_inclusive), + )), + None => Box::pin(ReaderStream::new(source)), + }; + Ok(Download { + metadata, + last_modified: None, + etag: None, + download_stream, }) } else { Err(DownloadError::NotFound) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 98be6f0637..0f95458ad1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -231,6 +231,8 @@ impl S3Bucket { match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag.clone(); + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); let body = object_output.body; let body = ByteStreamAsStream::from(body); @@ -239,6 +241,8 @@ impl S3Bucket { Ok(Download { metadata, + etag, + last_modified, download_stream: Box::pin(body), }) } From 572bc060110bf0d81dcc3e6317f12f6417733146 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 22 Dec 2023 20:47:55 +0200 Subject: [PATCH 43/57] Do not copy WAL for lagged slots (#6221) ## Problem See https://neondb.slack.com/archives/C026T7K2YP9/p1702813041997959 ## Summary of changes Do not take in account invalidated slots when calculate restart_lsn position for basebackup at page server ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 9361f08ad2..a197f425a6 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -12,6 +12,7 @@ #include #include #include "access/xact.h" +#include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" @@ -51,6 +52,8 @@ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ +#define MB ((XLogRecPtr)1024 * 1024) + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" char *wal_acceptors_list = ""; @@ -214,7 +217,6 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -1718,12 +1720,15 @@ walprop_pg_after_election(WalProposer *wp) { elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); - /* - * start from the beginning of the segment to fetch page headers - * verifed by XLogReader - */ - lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); + if (max_slot_wal_keep_size_mb <= 0 || lrRestartLsn + max_slot_wal_keep_size_mb*MB > wp->truncateLsn) + { + /* + * start from the beginning of the segment to fetch page headers + * verifed by XLogReader + */ + lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); + wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); + } } } } From cdb08f03621c669a2d6b1efaec89083e0840b4ca Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 23 Oct 2023 17:05:41 +0300 Subject: [PATCH 44/57] Introduce NeonWALReader downloading sk -> compute WAL on demand. It is similar to XLogReader, but when either requested segment is missing locally or requested LSN is before basebackup_lsn NeonWALReader asynchronously fetches WAL from one of safekeepers. Patch includes walproposer switch to NeonWALReader, splitting wouldn't make much sense as it is hard to test otherwise. This finally removes risk of pg_wal explosion (as well as slow start time) when one safekeeper is lagging, at the same time allowing to recover it. In the future reader should also be used by logical walsender for similar reasons (currently we download the tail on compute start synchronously). The main test is test_lagging_sk. 
However, I also run it manually a lot varying MAX_SEND_SIZE on both sides (on safekeeper and on walproposer), testing various fragmentations (one side having small buffer, another, both), which brought up https://github.com/neondatabase/neon/issues/6055 closes https://github.com/neondatabase/neon/issues/1012 --- pgxn/neon/Makefile | 1 + pgxn/neon/libpqwalproposer.h | 96 +++++ pgxn/neon/neon_walreader.c | 731 +++++++++++++++++++++++++++++++++++ pgxn/neon/neon_walreader.h | 29 ++ pgxn/neon/walproposer.c | 467 +++++++++++++--------- pgxn/neon/walproposer.h | 133 +++---- pgxn/neon/walproposer_pg.c | 388 +++++++++++++++---- 7 files changed, 1514 insertions(+), 331 deletions(-) create mode 100644 pgxn/neon/libpqwalproposer.h create mode 100644 pgxn/neon/neon_walreader.c create mode 100644 pgxn/neon/neon_walreader.h diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 466e346e46..c6b224a14d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,7 @@ OBJS = \ libpagestore.o \ neon.o \ neon_utils.o \ + neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h new file mode 100644 index 0000000000..cd7e568a47 --- /dev/null +++ b/pgxn/neon/libpqwalproposer.h @@ -0,0 +1,96 @@ +/* + * Interface to set of libpq wrappers walproposer and neon_walreader need. + * Similar to libpqwalreceiver, but it has blocking connection establishment and + * pqexec which don't fit us. Implementation is at walproposer_pg.c. + */ +#ifndef ___LIBPQWALPROPOSER_H__ +#define ___LIBPQWALPROPOSER_H__ + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Possible return values from walprop_async_read */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from walprop_async_write */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * This header is included by walproposer.h to define walproposer_api; if we're + * building walproposer without pg, ignore libpq part, leaving only interface + * types. 
+ */ +#ifndef WALPROPOSER_LIB + +#include "libpq-fe.h" + +/* + * Sometimes working directly with underlying PGconn is simpler, export the + * whole thing for simplicity. + */ +typedef struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received CopyData message from + * walprop_async_read */ +} WalProposerConn; + +extern WalProposerConn *libpqwp_connect_start(char *conninfo); +extern bool libpqwp_send_query(WalProposerConn *conn, char *query); +extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); +extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); +extern void libpqwp_disconnect(WalProposerConn *conn); + +#endif /* WALPROPOSER_LIB */ +#endif /* ___LIBPQWALPROPOSER_H__ */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c new file mode 100644 index 0000000000..f035c2928f --- /dev/null +++ b/pgxn/neon/neon_walreader.c @@ -0,0 +1,731 @@ +/* + * Like WALRead, but when WAL segment doesn't exist locally instead of throwing + * ERROR asynchronously tries to fetch it from the most advanced safekeeper. + * + * We can't use libpqwalreceiver as it blocks during connection establishment + * (and waiting for PQExec result), so use libpqwalproposer instead. + * + * TODO: keepalives are currently never sent, so the other side can close the + * connection prematurely. + * + * TODO: close conn if reading takes too long to prevent stuck connections. + */ +#include "postgres.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "libpq/pqformat.h" +#include "storage/fd.h" +#include "utils/wait_event.h" + +#include "libpq-fe.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +#define NEON_WALREADER_ERR_MSG_LEN 512 + +/* + * Can be called where NeonWALReader *state is available in the context, adds log_prefix. + */ +#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__) + +static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); +static void NeonWALReaderResetRemote(NeonWALReader *state); +static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +static void neon_wal_segment_close(NeonWALReader *state); +static bool is_wal_segment_exists(XLogSegNo segno, int segsize, + TimeLineID tli); + +/* + * State of connection to donor safekeeper. + */ +typedef enum +{ + /* no remote connection */ + RS_NONE, + /* doing PQconnectPoll, need readable socket */ + RS_CONNECTING_READ, + /* doing PQconnectPoll, need writable socket */ + RS_CONNECTING_WRITE, + /* Waiting for START_REPLICATION result */ + RS_WAIT_EXEC_RESULT, + /* replication stream established */ + RS_ESTABLISHED, +} NeonWALReaderRemoteState; + +struct NeonWALReader +{ + /* + * LSN before which we assume WAL is not available locally. Exists because + * though first segment after startup always exists, part before + * basebackup LSN is filled with zeros. + */ + XLogRecPtr available_lsn; + WALSegmentContext segcxt; + WALOpenSegment seg; + int wre_errno; + /* Explains failure to read, static for simplicity. 
*/ + char err_msg[NEON_WALREADER_ERR_MSG_LEN]; + + /* + * Saved info about request in progress, used to check validity of + * arguments after resume and remember how far we accomplished it. req_lsn + * is 0 if there is no request in progress. + */ + XLogRecPtr req_lsn; + Size req_len; + Size req_progress; + WalProposer *wp; /* we learn donor through walproposer */ + char donor_name[64]; /* saved donor safekeeper name for logging */ + /* state of connection to safekeeper */ + NeonWALReaderRemoteState rem_state; + WalProposerConn *wp_conn; + + /* + * position in wp_conn recvbuf from which we'll copy WAL next time, or + * NULL if there is no unprocessed message + */ + char *wal_ptr; + Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ + + /* + * LSN of wal_ptr position according to walsender to cross check against + * read request + */ + XLogRecPtr rem_lsn; + + /* prepended to lines logged by neon_walreader, if provided */ + char log_prefix[64]; +}; + +/* palloc and initialize NeonWALReader */ +NeonWALReader * +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +{ + NeonWALReader *reader; + + reader = (NeonWALReader *) + palloc_extended(sizeof(NeonWALReader), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!reader) + return NULL; + + reader->available_lsn = available_lsn; + reader->seg.ws_file = -1; + reader->seg.ws_segno = 0; + reader->seg.ws_tli = 0; + reader->segcxt.ws_segsize = wal_segment_size; + + reader->wp = wp; + + reader->rem_state = RS_NONE; + + if (log_prefix) + strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); + + return reader; +} + +void +NeonWALReaderFree(NeonWALReader *state) +{ + if (state->seg.ws_file != -1) + neon_wal_segment_close(state); + if (state->wp_conn) + libpqwp_disconnect(state->wp_conn); + pfree(state); +} + +/* + * Like vanilla WALRead, but if requested position is before available_lsn or + * WAL segment doesn't exist on disk, it tries to fetch needed segment from the + * advanced safekeeper. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error + * occurs, in which case 'err' has the desciption. Error always closes remote + * connection, if there was any, so socket subscription should be removed. + * + * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with + * NeonWALReaderSocket and call NeonWALRead again with exactly the same + * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq + * docs during connection establishment (before first successful read) socket + * underneath might change. + * + * Also, eventually walreader should switch from remote to local read; caller + * should remove subscription to socket then by checking NeonWALReaderEvents + * after successful read (otherwise next read might reopen the connection with + * different socket). + * + * Reading not monotonically is not supported and will result in error. + * + * Caller should be sure that WAL up to requested LSN exists, otherwise + * NEON_WALREAD_WOULDBLOCK might be always returned. + */ +NeonWALReadResult +NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + /* + * If requested data is before known available basebackup lsn or there is + * already active remote state, do remote read. 
+ */ + if (startptr < state->available_lsn || state->rem_state != RS_NONE) + { + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + if (NeonWALReadLocal(state, buf, startptr, count, tli)) + { + return NEON_WALREAD_SUCCESS; + } + else if (state->wre_errno == ENOENT) + { + nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr)); + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + else + { + return NEON_WALREAD_ERROR; + } +} + +/* Do the read from remote safekeeper. */ +static NeonWALReadResult +NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + if (state->rem_state == RS_NONE) + { + XLogRecPtr donor_lsn; + + /* no connection yet; start one */ + Safekeeper *donor = GetDonor(state->wp, &donor_lsn); + + if (donor == NULL) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to establish remote connection to fetch WAL: no donor available"); + return NEON_WALREAD_ERROR; + } + snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); + nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", + state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); + state->wp_conn = libpqwp_connect_start(donor->conninfo); + if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: immediately failed with %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + /* we'll poll immediately */ + state->rem_state = RS_CONNECTING_READ; + } + + if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) + { + switch (PQconnectPoll(state->wp_conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: poll error: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + case PGRES_POLLING_READING: + state->rem_state = RS_CONNECTING_READ; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_WRITING: + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_OK: + { + /* connection successfully established */ + char start_repl_query[128]; + + snprintf(start_repl_query, sizeof(start_repl_query), + "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", + LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", + state->donor_name, start_repl_query); + if (!libpqwp_send_query(state->wp_conn, start_repl_query)) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to send %s query to %s: %s", + start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + state->rem_state = RS_WAIT_EXEC_RESULT; + break; + } + + default: /* there is unused PGRES_POLLING_ACTIVE */ + Assert(false); + return NEON_WALREAD_ERROR; /* keep the compiler quiet */ + } + } + + if (state->rem_state == RS_WAIT_EXEC_RESULT) + { + switch (libpqwp_get_query_result(state->wp_conn)) + { + case WP_EXEC_SUCCESS_COPYBOTH: + state->rem_state = RS_ESTABLISHED; + break; + case WP_EXEC_NEEDS_INPUT: + return NEON_WALREAD_WOULDBLOCK; + case WP_EXEC_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "get 
START_REPLICATION result from %s failed: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + default: /* can't happen */ + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s: unexpected result", + state->donor_name); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + } + + Assert(state->rem_state == RS_ESTABLISHED); + + /* + * If we had the request before, verify args are the same and advance the + * result ptr according to the progress; otherwise register the request. + */ + if (state->req_lsn != InvalidXLogRecPtr) + { + if (state->req_lsn != startptr || state->req_len != count) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "args changed during request, was %X/%X %zu, now %X/%X %zu", + LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu", + LSN_FORMAT_ARGS(startptr), + count, + state->req_progress); + buf += state->req_progress; + } + else + { + state->req_lsn = startptr; + state->req_len = count; + state->req_progress = 0; + nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(startptr), + count); + } + + while (true) + { + Size to_copy; + + /* + * If we have no ready data, receive new message. + */ + if (state->wal_rem_len == 0 && + + /* + * check for the sake of 0 length reads; walproposer does these for + * heartbeats, though generally they shouldn't hit remote source. + */ + state->req_len - state->req_progress > 0) + { + NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state); + + if (read_msg_res != NEON_WALREAD_SUCCESS) + return read_msg_res; + } + + if (state->req_lsn + state->req_progress != state->rem_lsn) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(state->req_lsn + state->req_progress), + LSN_FORMAT_ARGS(state->rem_lsn), + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + + /* We can copy min of (available, requested) bytes. */ + to_copy = + Min(state->req_len - state->req_progress, state->wal_rem_len); + memcpy(buf, state->wal_ptr, to_copy); + state->wal_ptr += to_copy; + state->wal_rem_len -= to_copy; + state->rem_lsn += to_copy; + if (state->wal_rem_len == 0) + state->wal_ptr = NULL; /* freed by libpqwalproposer */ + buf += to_copy; + state->req_progress += to_copy; + if (state->req_progress == state->req_len) + { + XLogSegNo next_segno; + XLogSegNo req_segno; + + XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize); + XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize); + + /* + * Request completed. If there is a chance of serving next one + * locally, close the connection. 
+ */ + if (state->req_lsn < state->available_lsn && + state->rem_lsn >= state->available_lsn) + { + nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally", + LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno && + is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli)) + { + nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists", + LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + return NEON_WALREAD_SUCCESS; + } + } +} + +/* + * Read one WAL message from the stream, sets state->wal_ptr in case of success. + * Resets remote state in case of failure. + */ +static NeonWALReadResult +NeonWALReaderReadMsg(NeonWALReader *state) +{ + while (true) /* loop until we get 'w' */ + { + char *copydata_ptr; + int copydata_size; + StringInfoData s; + char msg_type; + int hdrlen; + + Assert(state->rem_state == RS_ESTABLISHED); + Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); + + switch (libpqwp_async_read(state->wp_conn, + ©data_ptr, + ©data_size)) + { + case PG_ASYNC_READ_SUCCESS: + break; + case PG_ASYNC_READ_TRY_AGAIN: + return NEON_WALREAD_WOULDBLOCK; + case PG_ASYNC_READ_FAIL: + snprintf(state->err_msg, + sizeof(state->err_msg), + "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len, + state->req_progress, + PQerrorMessage(state->wp_conn->pg_conn)); + goto err; + } + + /* put data on StringInfo to parse */ + s.data = copydata_ptr; + s.len = copydata_size; + s.cursor = 0; + s.maxlen = -1; + + if (copydata_size == 0) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "zero length copydata received"); + goto err; + } + msg_type = pq_getmsgbyte(&s); + switch (msg_type) + { + case 'w': + { + XLogRecPtr start_lsn; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "invalid WAL message received from primary"); + goto err; + } + + start_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ + pq_getmsgint64(&s); /* TimestampTz send_time */ + + state->rem_lsn = start_lsn; + state->wal_rem_len = (Size) (s.len - s.cursor); + state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); + nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", + LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); + + return NEON_WALREAD_SUCCESS; + } + case 'k': + { + XLogRecPtr end_lsn; + bool reply_requested; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "invalid keepalive message received from primary"); + goto err; + } + + end_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* TimestampTz timestamp; */ + reply_requested = pq_getmsgbyte(&s); + nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", + LSN_FORMAT_ARGS(end_lsn), + reply_requested); + if (end_lsn < state->req_lsn + state->req_len) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", + LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); + goto err; + 
} + continue; + } + default: + nwr_log(WARNING, "invalid replication message type %d", msg_type); + continue; + } + } +err: + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; +} + +/* reset remote connection and request in progress */ +static void +NeonWALReaderResetRemote(NeonWALReader *state) +{ + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + state->rem_state = RS_NONE; + if (state->wp_conn) + { + libpqwp_disconnect(state->wp_conn); + state->wp_conn = NULL; + } + state->donor_name[0] = '\0'; + state->wal_ptr = NULL; + state->wal_rem_len = 0; + state->rem_lsn = InvalidXLogRecPtr; +} + +/* + * Return socket of connection to remote source. Must be called only when + * connection exists (NeonWALReaderEvents returns non zero). + */ +pgsocket +NeonWALReaderSocket(NeonWALReader *state) +{ + if (!state->wp_conn) + nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); + return PQsocket(state->wp_conn->pg_conn); +} + +/* + * Returns events user should wait on connection socket or 0 if remote + * connection is not active. + */ +extern uint32 +NeonWALReaderEvents(NeonWALReader *state) +{ + switch (state->rem_state) + { + case RS_NONE: + return 0; + case RS_CONNECTING_READ: + return WL_SOCKET_READABLE; + case RS_CONNECTING_WRITE: + return WL_SOCKET_WRITEABLE; + case RS_WAIT_EXEC_RESULT: + case RS_ESTABLISHED: + return WL_SOCKET_READABLE; + default: + Assert(false); + return 0; /* make compiler happy */ + } +} + +static bool +NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. + */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + neon_wal_segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + if (!neon_wal_segment_open(state, nextSegNo, &tli)) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = errno; + + XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", + fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); + return false; + } + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? 
*/ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); + + if (readbytes < 0) + { + state->wre_errno = errno; + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", + fname, startoff, strerror(state->wre_errno)); + } + else + { + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", + fname, startoff); + } + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* + * Copy of vanilla wal_segment_open, but returns false in case of error instead + * of ERROR, with errno set. + * + * XLogReaderRoutine->segment_open callback for local pg_wal files + */ +static bool +neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + nwr_log(DEBUG5, "opening %s", path); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return true; + + return false; +} + +static bool +is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) +{ + struct stat stat_buffer; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, segno, segsize); + return stat(path, &stat_buffer) == 0; +} + +/* copy of vanilla wal_segment_close with NeonWALReader */ +static void +neon_wal_segment_close(NeonWALReader *state) +{ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + /* need to check errno? 
*/ + state->seg.ws_file = -1; + } +} + +char * +NeonWALReaderErrMsg(NeonWALReader *state) +{ + return state->err_msg; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h new file mode 100644 index 0000000000..805c94fc53 --- /dev/null +++ b/pgxn/neon/neon_walreader.h @@ -0,0 +1,29 @@ +#ifndef __NEON_WALREADER_H__ +#define __NEON_WALREADER_H__ + +#include "access/xlogdefs.h" + +/* forward declare so we don't have to expose the struct to the public */ +struct NeonWALReader; +typedef struct NeonWALReader NeonWALReader; + +/* avoid including walproposer.h as it includes us */ +struct WalProposer; +typedef struct WalProposer WalProposer; + +/* NeonWALRead return value */ +typedef enum +{ + NEON_WALREAD_SUCCESS, + NEON_WALREAD_WOULDBLOCK, + NEON_WALREAD_ERROR, +} NeonWALReadResult; + +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern void NeonWALReaderFree(NeonWALReader *state); +extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +extern pgsocket NeonWALReaderSocket(NeonWALReader *state); +extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern char *NeonWALReaderErrMsg(NeonWALReader *state); + +#endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index fc3332612c..4fb9a46d15 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -45,7 +45,6 @@ /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); @@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); -static char *FormatSafekeeperState(SafekeeperState state); +static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); -static uint32 SafekeeperStateDesiredEvents(SafekeeperState state); static char *FormatEvents(WalProposer *wp, uint32 events); + WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { @@ -113,6 +112,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; { @@ -127,8 +127,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); - wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]); - wp->safekeeper[wp->n_safekeepers].flushWrite = false; wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; @@ -277,7 +275,7 @@ WalProposerPoll(WalProposer *wp) wp->config->safekeeper_connection_timeout)) { walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, 
sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout); + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp) WalProposerPoll(wp); } -/* - * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - WalProposer *wp = to_remove->wp; - - /* Remove the existing event set, assign sk->eventPos = -1 */ - wp->api.free_event_set(wp); - /* Re-initialize it without adding any safekeeper events */ - wp->api.init_event_set(wp); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < wp->n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &wp->safekeeper[i]; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->state != SS_OFFLINE) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - /* will set sk->eventPos */ - wp->api.add_safekeeper_event_set(sk, desired_events); - } - } -} /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { - sk->wp->api.conn_finish(sk); sk->state = SS_OFFLINE; - sk->flushWrite = false; sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; - HackyRemoveWalProposerEvent(sk); + sk->wp->api.conn_finish(sk); + sk->wp->api.rm_safekeeper_event_set(sk); } /* @@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { +#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ WalProposer *wp = sk->wp; +#endif /* * Sanity check. We assume further down that the operations don't block @@ -527,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_VOTING: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -556,7 +518,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk) * Because PQconnectPoll can change the socket, we have to un-register the * old event and re-register an event on the new socket. */ - HackyRemoveWalProposerEvent(sk); + wp->api.rm_safekeeper_event_set(sk); wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ @@ -1112,6 +1074,9 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; + /* Now that we are ready to send it's a good moment to create WAL reader */ + wp->api.wal_reader_allocate(sk); + /* * Determine start LSN by comparing safekeeper's log term switch history * and proposer's, searching for the divergence point. @@ -1231,6 +1196,7 @@ StartStreaming(Safekeeper *sk) * once for a connection. 
 */
 	sk->state = SS_ACTIVE;
+	sk->active_state = SS_ACTIVE_SEND;
 	sk->streamingAt = sk->startStreamingAt;
 
 	/* event set will be updated inside SendMessageToNode */
@@ -1289,9 +1255,13 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 {
 	WalProposer *wp = sk->wp;
 
-	uint32		newEvents = WL_SOCKET_READABLE;
-
-	if (events & WL_SOCKET_WRITEABLE)
+	/*
+	 * Note: we don't know which socket awoke us (sk or nwr). However, as
+	 * SendAppendRequests always tries to send at least one msg in
+	 * SS_ACTIVE_SEND, be careful not to go there if we are only after the
+	 * sk response, otherwise it'd create a busy loop of pings.
+	 */
+	if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL)
 		if (!SendAppendRequests(sk))
 			return;
 
@@ -1299,28 +1269,29 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 	if (!RecvAppendResponses(sk))
 		return;
 
-	/*
-	 * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
-	 * in the buffer.
-	 *
-	 * LSN comparison checks if we have pending unsent messages. This check
-	 * isn't necessary now, because we always send append messages immediately
-	 * after arrival. But it's good to have it here in case we change this
-	 * behavior in the future.
-	 */
-	if (sk->streamingAt != wp->availableLsn || sk->flushWrite)
-		newEvents |= WL_SOCKET_WRITEABLE;
+#if PG_VERSION_NUM >= 150000
+	/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
+	if (events & WL_SOCKET_CLOSED)
+	{
+		walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
+					sk->host, sk->port);
+		ShutdownConnection(sk);
+		return;
+	}
+#endif
 
-	wp->api.update_event_set(sk, newEvents);
+	/* configure the event set for yield, whatever the substate is */
+	wp->api.active_state_update_event_set(sk);
 }
 
 /*
  * Send WAL messages starting from sk->streamingAt until the end or non-writable
- * socket, whichever comes first. Caller should take care of updating event set.
- * Even if no unsent WAL is available, at least one empty message will be sent
- * as a heartbeat, if socket is ready.
+ * socket or neon_walreader blocks, whichever comes first; active_state is
+ * updated accordingly. Caller should take care of updating event set. Even if
+ * no unsent WAL is available, at least one empty message will be sent as a
+ * heartbeat, if socket is ready.
  *
- * Can change state if Async* functions encounter errors and reset connection.
+ * Resets state and kills the connections if any error on them is encountered.
 * Returns false in this case, true otherwise. 
*/ static bool @@ -1328,11 +1299,11 @@ SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; - AppendRequestHeader *req; PGAsyncWriteResult writeResult; bool sentAnything = false; + AppendRequestHeader *req; - if (sk->flushWrite) + if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) @@ -1343,76 +1314,101 @@ SendAppendRequests(Safekeeper *sk) return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; + sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > wp->availableLsn) + if (sk->active_state == SS_ACTIVE_SEND) { - endLsn = wp->availableLsn; + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > wp->availableLsn) + { + endLsn = wp->availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); + + walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + sk->active_state = SS_ACTIVE_READ_WAL; } - req = &sk->appendRequest; - PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - - walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - /* wal_read will raise error on failure */ - wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn); - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) + if (sk->active_state == SS_ACTIVE_READ_WAL) { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; + char *errmsg; - case PG_ASYNC_WRITE_TRY_FLUSH: + req = &sk->appendRequest; - /* - * * We still need to call PQflush some more to finish the - * job. Caller function will handle this by setting right - * event* set. 
- */ - sk->flushWrite = true; - return true; + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } - case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = req->endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + sk->active_state = SS_ACTIVE_SEND; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event set. + */ + sk->active_state = SS_ACTIVE_FLUSH; + return true; + + case PG_ASYNC_WRITE_FAIL: + walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } } } @@ -1422,7 +1418,7 @@ SendAppendRequests(Safekeeper *sk) /* * Receive and process all available feedback. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. @@ -1608,6 +1604,53 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) return responses[wp->n_safekeepers - wp->quorum]; } +/* + * Return safekeeper with active connection from which WAL can be downloaded, or + * none if it doesn't exist. donor_lsn is set to end position of the donor to + * the best of our knowledge. + */ +Safekeeper * +GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +{ + *donor_lsn = InvalidXLogRecPtr; + Safekeeper *donor = NULL; + int i; + + if (wp->n_votes < wp->quorum) + { + walprop_log(WARNING, "GetDonor called before elections are won"); + return NULL; + } + + /* + * First, consider node which had determined our term start LSN as we know + * about its position immediately after election before any feedbacks are + * sent. + */ + if (wp->safekeeper[wp->donor].state >= SS_IDLE) + { + donor = &wp->safekeeper[wp->donor]; + *donor_lsn = wp->propEpochStartLsn; + } + + /* + * But also check feedbacks from all nodes with live connections and take + * the highest one. Note: if node sends feedbacks it already processed + * elected message so its term is fine. 
+ */ + for (i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + { + donor = sk; + *donor_lsn = sk->appendResponse.flushLsn; + } + } + return donor; +} + static void HandleSafekeeperResponse(WalProposer *wp) { @@ -1713,7 +1756,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) case PG_ASYNC_READ_FAIL: walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), + sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1753,7 +1796,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) if (tag != anymsg->tag) { walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1824,12 +1867,13 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; - uint32 events; + uint32 sk_events; + uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1841,9 +1885,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - wp->api.update_event_set(sk, events); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * nwr_events is relevant only during SS_ACTIVE which doesn't use + * BlockingWrite + */ + Assert(!nwr_events); + if (sk_events) + wp->api.update_event_set(sk, sk_events); return true; } @@ -1876,7 +1926,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta return false; case PG_ASYNC_WRITE_FAIL: walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1915,7 +1965,7 @@ AsyncFlush(Safekeeper *sk) return false; case -1: walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ResetConnection(sk); return false; @@ -1945,18 +1995,18 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * -FormatSafekeeperState(SafekeeperState state) +FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; - switch (state) + switch (sk->state) { case SS_OFFLINE: return_val = "offline"; @@ -1984,7 +2034,18 @@ 
FormatSafekeeperState(SafekeeperState state) return_val = "idle"; break; case SS_ACTIVE: - return_val = "active"; + switch (sk->active_state) + { + case SS_ACTIVE_SEND: + return_val = "active send"; + break; + case SS_ACTIVE_READ_WAL: + return_val = "active read WAL"; + break; + case SS_ACTIVE_FLUSH: + return_val = "active flush"; + break; + } break; } @@ -1997,22 +2058,21 @@ FormatSafekeeperState(SafekeeperState state) static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { - WalProposer *wp = sk->wp; - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. `events & expected != 0`) - */ + uint32 sk_events; + uint32 nwr_events; + uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ + WalProposer *wp = sk->wp; - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * Without one more level of notify target indirection we have no way to + * distinguish which socket woke up us, so just union expected events. + */ + expected = sk_events | nwr_events; + events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { @@ -2021,36 +2081,39 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * and then an assertion that's guaranteed to fail. */ walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } -/* Returns the set of events a safekeeper in this state should be waiting on +/* Returns the set of events for both safekeeper (sk_events) and neon_walreader + * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. */ -static uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) +void +SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { - uint32 result = WL_NO_EVENTS; + WalProposer *wp = sk->wp; + + *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ - switch (state) + switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; + *sk_events = WL_SOCKET_WRITEABLE; + return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; /* * Idle states use read-readiness as a sign that the connection @@ -2058,32 +2121,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state) */ case SS_VOTING: case SS_IDLE: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. 
We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ case SS_SEND_ELECTED_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + return; + case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; + switch (sk->active_state) + { + /* + * Everything is sent; we just wait for sk responses and + * latch. + * + * Note: this assumes we send all available WAL to + * safekeeper in one wakeup (unless it blocks). Otherwise + * we would want WL_SOCKET_WRITEABLE here to finish the + * work. + */ + case SS_ACTIVE_SEND: + *sk_events = WL_SOCKET_READABLE; + /* c.f. walprop_pg_active_state_update_event_set */ +#if PG_VERSION_NUM >= 150000 + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + + /* + * Waiting for neon_walreader socket, but we still read + * responses from sk socket. + */ + case SS_ACTIVE_READ_WAL: + *sk_events = WL_SOCKET_READABLE; + *nwr_events = wp->api.wal_reader_events(sk); + return; + + /* + * Need to flush the sk socket, so ignore neon_walreader + * one and set write interest on sk. + */ + case SS_ACTIVE_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* c.f. walprop_pg_active_state_update_event_set */ + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + } + return; /* The offline state expects no events. */ case SS_OFFLINE: - result = WL_NO_EVENTS; - break; + *sk_events = 0; + return; default: Assert(false); - break; } - - return result; } /* Returns a human-readable string corresponding to the event set diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 6ba2aae75b..a90e87b54f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -8,6 +8,9 @@ #include "replication/walreceiver.h" #include "utils/uuid.h" +#include "libpqwalproposer.h" +#include "neon_walreader.h" + #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -20,43 +23,9 @@ */ #define WL_NO_EVENTS 0 -struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ +struct WalProposerConn; /* Defined in libpqwalproposer.h */ typedef struct WalProposerConn WalProposerConn; -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - - /* - * The read is ongoing. Wait until the connection is read-ready, then try - * again. - */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - - /* - * The write started, but you'll need to call PQflush some more times to - * finish it off. We just tried, so it's best to wait until the connection - * is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - /* * WAL safekeeper state, which is used to wait for some event. * @@ -133,6 +102,40 @@ typedef enum SS_ACTIVE, } SafekeeperState; +/* + * Sending WAL substates of SS_ACTIVE. 
+ */ +typedef enum +{ + /* + * We are ready to send more WAL, waiting for latch set to learn about + * more WAL becoming available (or just a timeout to send heartbeat). + */ + SS_ACTIVE_SEND, + + /* + * Polling neon_walreader to receive chunk of WAL (probably remotely) to + * send to this safekeeper. + * + * Note: socket management is done completely inside walproposer_pg for + * simplicity, and thus simulation doesn't test it. Which is fine as + * simulation is mainly aimed at consensus checks, not waiteventset + * management. + * + * Also, while in this state we don't touch safekeeper socket, so in + * theory it might close connection as inactive. This can be addressed if + * needed; however, while fetching WAL we should regularly send it, so the + * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle + * walreader socket), but similarly shouldn't be a problem. + */ + SS_ACTIVE_READ_WAL, + + /* + * Waiting for write readiness to flush the socket. + */ + SS_ACTIVE_FLUSH, +} SafekeeperActiveState; + /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -341,12 +344,11 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush,* - * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ + SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ @@ -367,12 +369,17 @@ typedef struct Safekeeper /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState *xlogreader; + NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; + + /* + * Neon WAL reader position in wait event set, or -1 if no socket. + */ + int nwrEventPos; #endif @@ -401,31 +408,6 @@ typedef enum */ } WalProposerConnectPollStatusType; -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - - /* - * Any success result other than a single CopyBoth was received. The - * specifics of the result were already logged, but it may be useful to - * provide an error message indicating which safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. - */ - WP_EXEC_UNEXPECTED_SUCCESS, - - /* - * No result available at this time. Wait until read-ready, then call - * again. Internally, this is returned when PQisBusy indicates that - * PQgetResult would block. - */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - /* Re-exported ConnStatusType */ typedef enum { @@ -486,7 +468,7 @@ typedef struct walproposer_api /* Flush buffer to the network, aka PQflush. */ int (*conn_flush) (Safekeeper *sk); - /* Close the connection, aka PQfinish. */ + /* Reset sk state: close pq connection, deallocate xlogreader. */ void (*conn_finish) (Safekeeper *sk); /* @@ -506,14 +488,14 @@ typedef struct walproposer_api /* Download WAL from startpos to endpos and make it available locally. */ bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); - /* Read WAL from disk to buf. */ - void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count); - /* Allocate WAL reader. 
*/ void (*wal_reader_allocate) (Safekeeper *sk); - /* Deallocate event set. */ - void (*free_event_set) (WalProposer *wp); + /* Read WAL from disk to buf. */ + NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); + + /* Returns events to be awaited on WAL reader, if any. */ + uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); @@ -521,9 +503,15 @@ typedef struct walproposer_api /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); + /* Configure wait event set for yield in SS_ACTIVE. */ + void (*active_state_update_event_set) (Safekeeper *sk); + /* Add a new safekeeper connection to the event set. */ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + /* Remove safekeeper connection from event set */ + void (*rm_safekeeper_event_set) (Safekeeper *sk); + /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available @@ -709,6 +697,13 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +/* + * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to + * recreate set from scratch, hence the export. + */ +extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); +extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); + #define WPEVENT 1337 /* special log level for walproposer internal * events */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a197f425a6..6199def43f 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -44,10 +44,13 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" -#include "neon.h" -#include "walproposer.h" #include "libpq-fe.h" +#include "libpqwalproposer.h" +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + #define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ @@ -94,6 +97,10 @@ static void XLogBroadcastWalProposer(WalProposer *wp); static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); static void XLogWalPropClose(XLogRecPtr recptr); +static void add_nwr_event_set(Safekeeper *sk, uint32 events); +static void update_nwr_event_set(Safekeeper *sk, uint32 events); +static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); + static void init_walprop_config(bool syncSafekeepers) { @@ -543,14 +550,6 @@ walprop_pg_load_libpqwalreceiver(void) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); } -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from walprop_async_read */ -}; - /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -588,16 +587,17 @@ walprop_status(Safekeeper *sk) } } -static void -walprop_connect_start(Safekeeper *sk) +WalProposerConn * +libpqwp_connect_start(char *conninfo) { + PGconn *pg_conn; + WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int n; char *password = neon_auth_token; - Assert(sk->conn == 
NULL); /* * Connect using the given connection string. If the NEON_AUTH_TOKEN @@ -616,7 +616,7 @@ walprop_connect_start(Safekeeper *sk) n++; } keywords[n] = "dbname"; - values[n] = sk->conninfo; + values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; @@ -637,11 +637,20 @@ walprop_connect_start(Safekeeper *sk) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - sk->conn = palloc(sizeof(WalProposerConn)); - sk->conn->pg_conn = pg_conn; - sk->conn->is_nonblocking = false; /* connections always start in - * blocking mode */ - sk->conn->recvbuf = NULL; + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} + +static void +walprop_connect_start(Safekeeper *sk) +{ + Assert(sk->conn == NULL); + sk->conn = libpqwp_connect_start(sk->conninfo); + } static WalProposerConnectPollStatusType @@ -685,26 +694,33 @@ walprop_connect_poll(Safekeeper *sk) return return_val; } -static bool -walprop_send_query(Safekeeper *sk, char *query) +extern bool +libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ - if (!ensure_nonblocking_status(sk->conn, false)) + if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(sk->conn->pg_conn, query)) + if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } -static WalProposerExecStatusType -walprop_get_query_result(Safekeeper *sk) +static bool +walprop_send_query(Safekeeper *sk, char *query) { + return libpqwp_send_query(sk->conn, query); +} + +WalProposerExecStatusType +libpqwp_get_query_result(WalProposerConn *conn) +{ + PGresult *result; WalProposerExecStatusType return_val; @@ -712,14 +728,14 @@ walprop_get_query_result(Safekeeper *sk) char *unexpected_success = NULL; /* Consume any input that we might be missing */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; - if (PQisBusy(sk->conn->pg_conn)) + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; - result = PQgetResult(sk->conn->pg_conn); + result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & @@ -780,6 +796,12 @@ walprop_get_query_result(Safekeeper *sk) return return_val; } +static WalProposerExecStatusType +walprop_get_query_result(Safekeeper *sk) +{ + return libpqwp_get_query_result(sk->conn); +} + static pgsocket walprop_socket(Safekeeper *sk) { @@ -792,38 +814,21 @@ walprop_flush(Safekeeper *sk) return (PQflush(sk->conn->pg_conn)); } -static void -walprop_finish(Safekeeper *sk) +/* Like libpqrcv_receive. *buf is valid until the next call. */ +PGAsyncReadResult +libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - if (!sk->conn) - return; - if (sk->conn->recvbuf != NULL) - PQfreemem(sk->conn->recvbuf); - PQfinish(sk->conn->pg_conn); - pfree(sk->conn); - sk->conn = NULL; -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. 
- */ -static PGAsyncReadResult -walprop_async_read(Safekeeper *sk, char **buf, int *amount) -{ int result; - if (sk->conn->recvbuf != NULL) + if (conn->recvbuf != NULL) { - PQfreemem(sk->conn->recvbuf); - sk->conn->recvbuf = NULL; + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; } /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) { *amount = 0; *buf = NULL; @@ -841,7 +846,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true)) + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) { case 0: *amount = 0; @@ -856,7 +861,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ - ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn)); + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); @@ -877,11 +882,23 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) default: /* Positive values indicate the size of the returned result */ *amount = result; - *buf = sk->conn->recvbuf; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +walprop_async_read(Safekeeper *sk, char **buf, int *amount) +{ + return libpqwp_async_read(sk->conn, buf, amount); +} + static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { @@ -964,6 +981,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) return true; } +void +libpqwp_disconnect(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +static void +walprop_finish(Safekeeper *sk) +{ + if (sk->conn) + { + libpqwp_disconnect(sk->conn); + sk->conn = NULL; + } + + /* free xlogreader */ + if (sk->xlogreader) + { + NeonWALReaderFree(sk->xlogreader); + sk->xlogreader = NULL; + } + rm_safekeeper_event_set(sk, false); +} + /* * Subscribe for new WAL and stream it in the loop to safekeepers. 
* @@ -1402,30 +1446,56 @@ XLogWalPropClose(XLogRecPtr recptr) walpropFile = -1; } -static void -walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count) -{ - WALReadError errinfo; - - if (!WALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id(), - &errinfo)) - { - WALReadRaiseError(&errinfo); - } -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { - sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + char log_prefix[64]; + + snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + Assert(!sk->xlogreader); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); } +static NeonWALReadResult +walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) +{ + NeonWALReadResult res; + + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * If we have the socket subscribed, but walreader doesn't need any + * events, it must mean that remote connection just closed hoping to + * do next read locally. Remove the socket then. It is important to do + * as otherwise next read might open another connection and we won't + * be able to distinguish whether we have correct socket added in wait + * event set. + */ + if (NeonWALReaderEvents(sk->xlogreader) == 0) + rm_safekeeper_event_set(sk, false); + } + else if (res == NEON_WALREAD_ERROR) + { + *errmsg = NeonWALReaderErrMsg(sk->xlogreader); + } + + return res; +} + +static uint32 +walprop_pg_wal_reader_events(Safekeeper *sk) +{ + return NeonWALReaderEvents(sk->xlogreader); +} + static WaitEventSet *waitEvents; static void @@ -1440,6 +1510,7 @@ walprop_pg_free_event_set(WalProposer *wp) for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; } } @@ -1449,11 +1520,35 @@ walprop_pg_init_event_set(WalProposer *wp) if (waitEvents) elog(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers); + /* for each sk, we have socket plus potentially socket for neon walreader */ + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); + + for (int i = 0; i < wp->n_safekeepers; i++) + { + wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + } +} + +/* add safekeeper socket to wait event set */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->eventPos == -1); + sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); +} + +/* add neon wal reader socket to wait event set */ +static void +add_nwr_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->nwrEventPos == -1); + sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1465,10 +1560,147 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } +/* + * Update neon_walreader event. 
+ * Can be called when nwr socket doesn't exist, does nothing in this case. + */ static void -walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +update_nwr_event_set(Safekeeper *sk, uint32 events) { - sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); + /* eventPos = -1 when we don't have an event */ + if (sk->nwrEventPos != -1) + ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); +} + + +static void +walprop_pg_active_state_update_event_set(Safekeeper *sk) +{ + uint32 sk_events; + uint32 nwr_events; + + Assert(sk->state == SS_ACTIVE); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * If we need to wait for neon_walreader, ensure we have up to date socket + * in the wait event set. + */ + if (sk->active_state == SS_ACTIVE_READ_WAL) + { + /* + * TODO: instead of reattaching socket (and thus recreating WES) each + * time we should keep it if possible, i.e. if connection is already + * established. Note that single neon_walreader object can switch + * between local and remote reads multiple times during its lifetime, + * so careful bookkeeping is needed here. + */ + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } + else + { + /* + * Hack: we should always set 0 here, but for random reasons + * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least + * some event. Since there is also no way to remove socket except + * reconstructing the whole set, SafekeeperStateDesiredEvents instead + * gives WL_SOCKET_CLOSED if socket exists. We never expect it to + * trigger. + * + * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event + * removal. + */ +#if PG_VERSION_NUM >= 150000 + Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); + update_nwr_event_set(sk, WL_SOCKET_CLOSED); +#else /* pg 14 */ + rm_safekeeper_event_set(sk, false); +#endif + } + walprop_pg_update_event_set(sk, sk_events); +} + +static void +walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) +{ + rm_safekeeper_event_set(to_remove, true); +} + +/* + * A hacky way to remove single event from the event set. Can be called if event + * doesn't exist, does nothing in this case. + * + * Note: Internally, this completely reconstructs the event set. It should be + * avoided if possible. + * + * If is_sk is true, socket of connection to safekeeper is removed; otherwise + * socket of neon_walreader. + */ +static void +rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) +{ + WalProposer *wp = to_remove->wp; + + elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); + + /* + * Shortpath for exiting if have nothing to do. We never call this + * function with safekeeper socket not existing, but do that with neon + * walreader socket. + */ + if ((is_sk && to_remove->eventPos == -1) || + (!is_sk && to_remove->nwrEventPos == -1)) + { + return; + } + + /* Remove the existing event set, assign sk->eventPos = -1 */ + walprop_pg_free_event_set(wp); + + /* Re-initialize it without adding any safekeeper events */ + wp->api.init_event_set(wp); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. 
+ */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk == to_remove) + { + if (is_sk) + sk->eventPos = -1; + else + sk->nwrEventPos = -1; + } + + /* + * If this safekeeper isn't offline, add events for it, except for the + * event requested to remove. + */ + if (sk->state != SS_OFFLINE) + { + uint32 sk_events; + uint32 nwr_events; + + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + if (sk != to_remove || !is_sk) + { + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, sk_events); + } + else if ((sk != to_remove || is_sk) && nwr_events) + { + add_nwr_event_set(sk, nwr_events); + } + } + } } static int @@ -1750,12 +1982,14 @@ static const walproposer_api walprop_pg = { .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, - .wal_read = walprop_pg_wal_read, .wal_reader_allocate = walprop_pg_wal_reader_allocate, - .free_event_set = walprop_pg_free_event_set, + .wal_read = walprop_pg_wal_read, + .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, + .active_state_update_event_set = walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, From 14913c6443f36e9c94cab63698fdfd910a016148 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 8 Dec 2023 18:05:48 +0300 Subject: [PATCH 45/57] Adapt rust walproposer to neon_walreader. --- libs/walproposer/src/api_bindings.rs | 61 +++++++++++++++++++--------- libs/walproposer/src/walproposer.rs | 37 +++++++++++------ 2 files changed, 66 insertions(+), 32 deletions(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 77afe1e686..2f633243be 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -8,6 +8,7 @@ use std::ffi::CString; use crate::bindings::uint32; use crate::bindings::walproposer_api; +use crate::bindings::NeonWALReadResult; use crate::bindings::PGAsyncReadResult; use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; @@ -191,21 +192,6 @@ extern "C" fn recovery_download( } } -#[allow(clippy::unnecessary_cast)] -extern "C" fn wal_read( - sk: *mut Safekeeper, - buf: *mut ::std::os::raw::c_char, - startptr: XLogRecPtr, - count: Size, -) { - unsafe { - let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); - let callback_data = (*(*(*sk).wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).wal_read(&mut (*sk), buf, startptr) - } -} - extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -214,11 +200,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { } } -extern "C" fn free_event_set(wp: *mut WalProposer) { +#[allow(clippy::unnecessary_cast)] +extern "C" fn wal_read( + sk: *mut Safekeeper, + buf: *mut ::std::os::raw::c_char, + startptr: XLogRecPtr, + count: Size, + _errmsg: *mut *mut ::std::os::raw::c_char, +) -> NeonWALReadResult { unsafe { - let callback_data = (*(*wp).config).callback_data; + let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); + let callback_data = (*(*(*sk).wp).config).callback_data; let api = 
callback_data as *mut Box; - (*api).free_event_set(&mut (*wp)); + // TODO: errmsg is not forwarded + (*api).wal_read(&mut (*sk), buf, startptr) + } +} + +extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).wal_reader_events(&mut (*sk)) } } @@ -238,6 +241,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).active_state_update_event_set(&mut (*sk)); + } +} + extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -246,6 +257,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).rm_safekeeper_event_set(&mut (*sk)); + } +} + extern "C" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, @@ -401,12 +420,14 @@ pub(crate) fn create_api() -> walproposer_api { conn_async_write: Some(conn_async_write), conn_blocking_write: Some(conn_blocking_write), recovery_download: Some(recovery_download), - wal_read: Some(wal_read), wal_reader_allocate: Some(wal_reader_allocate), - free_event_set: Some(free_event_set), + wal_read: Some(wal_read), + wal_reader_events: Some(wal_reader_events), init_event_set: Some(init_event_set), update_event_set: Some(update_event_set), + active_state_update_event_set: Some(active_state_update_event_set), add_safekeeper_event_set: Some(add_safekeeper_event_set), + rm_safekeeper_event_set: Some(rm_safekeeper_event_set), wait_event_set: Some(wait_event_set), strong_random: Some(strong_random), get_redo_start_lsn: Some(get_redo_start_lsn), diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f5723018d7..013400325d 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -6,8 +6,8 @@ use utils::id::TenantTimelineId; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree, - WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, + WalProposerFree, WalProposerStart, }, }; @@ -90,15 +90,15 @@ pub trait ApiImpl { todo!() } - fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) { + fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult { todo!() } - fn wal_reader_allocate(&self, _sk: &mut Safekeeper) { + fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult { todo!() } - fn free_event_set(&self, _wp: &mut WalProposer) { + fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 { todo!() } @@ -110,10 +110,18 @@ pub trait ApiImpl { todo!() } + fn active_state_update_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) { todo!() } + fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult { todo!() } @@ -240,6 +248,7 @@ impl Drop for Wrapper { #[cfg(test)] 
mod tests { + use core::panic; use std::{ cell::Cell, sync::{atomic::AtomicUsize, mpsc::sync_channel}, @@ -247,7 +256,7 @@ mod tests { use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, walproposer::Wrapper}; + use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; use super::ApiImpl; @@ -355,12 +364,9 @@ mod tests { true } - fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) { - println!("wal_reader_allocate") - } - - fn free_event_set(&self, _: &mut crate::bindings::WalProposer) { - println!("free_event_set") + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { + println!("wal_reader_allocate"); + crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { @@ -383,6 +389,13 @@ mod tests { self.wait_events.set(WaitEventsData { sk, event_mask }); } + fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { + println!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut crate::bindings::Safekeeper + ); + } + fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, From df760e6de5c2a398de3f00d7deba97d5db5fded4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 6 Dec 2023 10:12:19 +0300 Subject: [PATCH 46/57] Add test_lagging_sk. --- test_runner/fixtures/neon_fixtures.py | 23 ++ test_runner/regress/test_wal_acceptor.py | 295 +++++++++++++++++++---- 2 files changed, 267 insertions(+), 51 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a9133f1c9c..597e311e02 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -365,6 +365,12 @@ class PgProtocol: result.append(cur.fetchall()) return result + def safe_psql_scalar(self, query) -> Any: + """ + Execute query returning single row with single column. + """ + return self.safe_psql(query)[0][0] + @dataclass class AuthKeys: @@ -2733,6 +2739,13 @@ class Endpoint(PgProtocol): ): self.stop() + # Checkpoints running endpoint and returns pg_wal size in MB. + def get_pg_wal_size(self): + log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + self.safe_psql("checkpoint") + assert self.pgdata_dir is not None # please mypy + return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -2931,6 +2944,13 @@ class Safekeeper: return segments +# Walreceiver as returned by sk's timeline status endpoint. 
+@dataclass +class Walreceiver: + conn_id: int + state: str + + @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int @@ -2941,6 +2961,7 @@ class SafekeeperTimelineStatus: backup_lsn: Lsn peer_horizon_lsn: Lsn remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] @dataclass @@ -3002,6 +3023,7 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -3011,6 +3033,7 @@ class SafekeeperHttpClient(requests.Session): backup_lsn=Lsn(resj["backup_lsn"]), peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, ) def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3c40a9cb3e..5a0856c69c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None): try: if f(): break - except Exception: + except Exception as e: + log.info(f"got exception while waiting for {desc}: {e}") pass elapsed = time.time() - started_at if elapsed > timeout: @@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Context manager which logs passed time on exit. +class DurationLogger: + def __init__(self, desc): + self.desc = desc + + def __enter__(self): + self.ts_before = time.time() + + def __exit__(self, *exc): + log.info(f"{self.desc} finished in {time.time() - self.ts_before}s") + + +# Context manager which logs WAL position change on exit. +class WalChangeLogger: + def __init__(self, ep, desc_before): + self.ep = ep + self.desc_before = desc_before + + def __enter__(self): + self.ts_before = time.time() + self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info(f"{self.desc_before}, lsn_before={self.lsn_before}") + + def __exit__(self, *exc): + lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info( + f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s" + ) + + # Test that we can create timeline with one safekeeper down and initialize it -# later when some data already had been written. +# later when some data already had been written. It is strictly weaker than +# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute +# download (recovery) and as such useful for development/testing. def test_late_init(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1 = env.safekeepers[0] sk1.stop() - # create and insert smth while safekeeper is down... - env.neon_cli.create_branch("test_late_init") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") + # create and insert smth while safekeeper is down... 
endpoint.safe_psql("create table t(key int, value text)") - endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") - log.info("insert with safekeeper down done") + with WalChangeLogger(endpoint, "doing insert with sk1 down"): + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") endpoint.stop() # stop compute # stop another safekeeper, and start one which missed timeline creation @@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.start() # insert some more - endpoint = env.endpoints.create_start("test_late_init") + with DurationLogger("recovery"): + endpoint = env.endpoints.create_start("test_late_init") endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") + wait_flush_lsn_align_by_ep( + env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]] + ) + # Check that WALs are the same. + cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id) + # is timeline flush_lsn equal on provided safekeepers? -def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id): - status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id) - status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id) - log.info( - f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}" +def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + flush_lsns = [ + sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn + for sk_http_cli in sk_http_clis + ] + log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}") + return all([flush_lsns[0] == flsn for flsn in flush_lsns]) + + +def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 + + +# Assert by xxd that WAL on given safekeepers is identical. No compute must be +# running for this to be reliable. +def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed" + sk_http_clis = [sk.http_client() for sk in sks] + + # First check that term / flush_lsn are the same: it is easier to + # report/understand if WALs are different due to that. + statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] + term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + assert ( + term_flush_lsns[0] == tfl + ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + + # check that WALs are identic. 
+ segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] + for cmp_segs, sk in zip(segs[1:], sks[1:]): + assert ( + segs[0] == cmp_segs + ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" + log.info(f"comparing segs {segs[0]}") + + sk0 = sks[0] + for sk in sks[1:]: + (_, mismatch, not_regular) = filecmp.cmpfiles( + sk0.timeline_dir(tenant_id, timeline_id), + sk.timeline_dir(tenant_id, timeline_id), + segs[0], + shallow=False, + ) + log.info( + f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" + ) + + for f in mismatch: + f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) + f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) + stdout_filename = "{}.filediff".format(f2) + + with open(stdout_filename, "w") as stdout_f: + subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) + subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + + cmd = "diff {}.hex {}.hex".format(f1, f2) + subprocess.run([cmd], stdout=stdout_f, shell=True) + + assert (mismatch, not_regular) == ( + [], + [], + ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identic" + + +# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is +# running. ep is stopped by this function. This is used in tests which check +# binary equality of WAL segments on safekeepers; which is inherently racy as +# shutting down endpoint might always write some WAL which can get to only one +# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if +# it has changed. +def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks): + sk_http_clis = [sk.http_client() for sk in sks] + # First wait for the alignment. + wait( + partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id), + "flush_lsn to get aligned", ) - return status1.flush_lsn == status2.flush_lsn + ep.stop() # then stop endpoint + # Even if there is no compute, there might be some in flight data; ensure + # all walreceivers die before rechecking. + for sk_http_cli in sk_http_clis: + wait( + partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id), + "walreceivers to be gone", + ) + # Now recheck again flush_lsn and exit if it is good + if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + return + # Otherwise repeat. + log.info("flush_lsn changed during endpoint shutdown; retrying alignment") + ep = env.endpoints.create_start(branch) -# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that -# 1) walproposer can't recover node if it misses WAL written by previous computes, but -# still starts up and functions normally if two other sks are ok. -# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions -# normally if two other sks are ok. -# 3) Lagged safekeeper can still recover by peer recovery. -def test_one_sk_down(neon_env_builder: NeonEnvBuilder): - pass +# Test behaviour with one safekeeper down and missing a lot of WAL, exercising +# neon_walreader and checking that pg_wal never bloats. Namely, ensures that +# compute doesn't keep many WAL for lagging sk, but still can recover it with +# neon_walreader, in two scenarious: a) WAL never existed on compute (it started +# on basebackup LSN later than lagging sk position) though segment file exists +# b) WAL had been recycled on it and segment file doesn't exist. 
+# +# Also checks along the way that whenever there are two sks alive, compute +# should be able to commit. +def test_lagging_sk(neon_env_builder: NeonEnvBuilder): + # inserts ~20MB of WAL, a bit more than a segment. + def fill_segment(ep): + ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + (sk1, sk2, sk3) = env.safekeepers + + # create and insert smth while safekeeper is down... + sk1.stop() + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_lagging_sk") + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("create table t(key int, value text)") + # make small insert to be on the same segment + ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'") + log.info("insert with safekeeper down done") + ep.stop() # stop compute + + # Stop another safekeeper, and start one which missed timeline creation. + sk2.stop() + sk1.start() + + # Start new ep and insert some more. neon_walreader should download WAL for + # sk1 because it should be filled since the horizon (initial LSN) which is + # earlier than basebackup LSN. + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now repeat insertion with sk1 down, but with inserting more data to check + # that WAL on compute is removed. + sk1.stop() + sk2.start() + + # min_wal_size must be at least 2x segment size. + min_wal_config = [ + "min_wal_size=32MB", + "max_wal_size=32MB", + "wal_keep_size=0", + "log_checkpoints=on", + ] + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + with WalChangeLogger(ep, "doing large insert with sk1 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + sk2.stop() # stop another sk to ensure sk1 and sk3 can work + sk1.start() + with DurationLogger("recovery"): + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now do the same with different safekeeper sk2 down, and restarting ep + # before recovery (again scenario when recovery starts below basebackup_lsn, + # but multi segment now). + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + with WalChangeLogger(ep, "doing large insert with sk2 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + ep.stop() + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + sk2.start() + with DurationLogger("recovery"): + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3]) + # Check that WALs are the same. 
+ cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) # Smaller version of test_one_sk_down testing peer recovery in isolation: that @@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): sk2_http_cli = sk2.http_client() # ensure tli gets created on sk1, peer recovery won't do that wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) @@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024 # wait a bit, lsns shouldn't change - # time.sleep(5) + time.sleep(2) sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id) sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id) log.info( @@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # now restart safekeeper with peer recovery enabled and wait for recovery sk1.stop().start(extra_opts=["--peer-recovery=true"]) wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) - # check that WALs are identic after recovery - segs = sk1.list_segments(tenant_id, timeline_id) - log.info(f"segs are {segs}") - - (_, mismatch, not_regular) = filecmp.cmpfiles( - sk1.timeline_dir(tenant_id, timeline_id), - sk2.timeline_dir(tenant_id, timeline_id), - segs, - shallow=False, - ) - log.info( - f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" - ) - - for f in mismatch: - f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) - - with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) - - cmd = "diff {}.hex {}.hex".format(f1, f2) - subprocess.run([cmd], stdout=stdout_f, shell=True) - - assert (mismatch, not_regular) == ([], []) + cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() From 9c493869c786ee2e5a4e099bef7f5273b0b68746 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 14 Dec 2023 17:08:36 +0300 Subject: [PATCH 47/57] Perform synchronous WAL download in wp only for logical replication. wp -> sk communication now uses neon_walreader which will fetch missing WAL on demand from safekeepers, so doesn't need this anymore. Also, cap WAL download by max_slot_wal_keep_size to be able to start compute if lag is too high. 
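To illustrate the capping rule described above: the download start position is clamped so that the range fetched before basebackup never exceeds max_slot_wal_keep_size. A minimal sketch of that arithmetic follows; the names are illustrative only and not the walproposer C code (which works on XLogRecPtr and megabytes):

```rust
/// Illustrative only: clamp the start of the WAL range downloaded for logical
/// replication so that at most `cap` bytes are fetched before basebackup.
/// `None` means no download is needed (no logical slot restart LSN found).
fn cap_download_start(restart_lsn: Option<u64>, epoch_start_lsn: u64, cap: Option<u64>) -> Option<u64> {
    let start = restart_lsn?;
    match cap {
        // Too far behind: fetch only the tail. The logical slot will have to be
        // recreated, matching max_slot_wal_keep_size semantics, but the compute
        // can still start.
        Some(cap) if epoch_start_lsn.saturating_sub(start) >= cap => Some(epoch_start_lsn - cap),
        _ => Some(start),
    }
}
```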
--- libs/walproposer/src/api_bindings.rs | 19 +------ libs/walproposer/src/walproposer.rs | 10 +++- pgxn/neon/walproposer.c | 28 +++------- pgxn/neon/walproposer.h | 15 ++---- pgxn/neon/walproposer_pg.c | 78 +++++++++++++++++++++------- 5 files changed, 82 insertions(+), 68 deletions(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 2f633243be..e884f8438a 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -14,7 +14,6 @@ use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; use crate::bindings::Size; use crate::bindings::StringInfoData; -use crate::bindings::TimeLineID; use crate::bindings::TimestampTz; use crate::bindings::WalProposer; use crate::bindings::WalProposerConnStatusType; @@ -179,16 +178,11 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download( - sk: *mut Safekeeper, - _timeline: TimeLineID, - startpos: XLogRecPtr, - endpos: XLogRecPtr, -) -> bool { +extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).recovery_download(&mut (*sk), startpos, endpos) + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -354,14 +348,6 @@ extern "C" fn log_internal( } } -extern "C" fn after_election(wp: *mut WalProposer) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).after_election(&mut (*wp)) - } -} - #[derive(Debug)] pub enum Level { Debug5, @@ -435,7 +421,6 @@ pub(crate) fn create_api() -> walproposer_api { process_safekeeper_feedback: Some(process_safekeeper_feedback), confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), - after_election: Some(after_election), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 013400325d..87001c9c66 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -86,7 +86,7 @@ pub trait ApiImpl { todo!() } - fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool { + fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool { todo!() } @@ -364,6 +364,14 @@ mod tests { true } + fn recovery_download( + &self, + _wp: &mut crate::bindings::WalProposer, + _sk: &mut crate::bindings::Safekeeper, + ) -> bool { + true + } + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { println!("wal_reader_allocate"); crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 4fb9a46d15..5874d199f9 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -809,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk) } else if (wp->n_votes > wp->quorum) { - /* recovery already performed, just start streaming */ + /* already elected, start streaming */ SendProposerElected(sk); } else @@ -835,21 +835,16 @@ HandleElectedProposer(WalProposer *wp) DetermineEpochStartLsn(wp); /* - * Check if not all safekeepers are up-to-date, we need to download WAL - * needed to synchronize them + * Synchronously download WAL from the most advanced safekeeper. We do + * that only for logical replication (and switching logical walsenders to + * neon_walreader is a todo.) 
*/ - if (wp->truncateLsn < wp->propEpochStartLsn) + if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(wp->truncateLsn), - LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - /* Perform recovery */ - if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) - walprop_log(FATAL, "Failed to recover state"); + walprop_log(FATAL, "failed to download WAL for logical replicaiton"); } - else if (wp->config->syncSafekeepers) + + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); @@ -1047,13 +1042,6 @@ DetermineEpochStartLsn(WalProposer *wp) } walprop_shared->mineLastElectedTerm = wp->propTerm; } - - /* - * WalProposer has just elected itself and initialized history, so we can - * call election callback. Usually it updates truncateLsn to fetch WAL for - * logical replication. - */ - wp->api.after_election(wp); } /* diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index a90e87b54f..2b2c252a18 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -485,8 +485,11 @@ typedef struct walproposer_api /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); - /* Download WAL from startpos to endpos and make it available locally. */ - bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); + /* + * Download WAL before basebackup for logical walsenders from sk, if + * needed + */ + bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. */ void (*wal_reader_allocate) (Safekeeper *sk); @@ -556,14 +559,6 @@ typedef struct walproposer_api * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); - - /* - * Called right after the proposer was elected, but before it started - * recovery and sent ProposerElected message to the safekeepers. - * - * Used by logical replication to update truncateLsn. 
- */ - void (*after_election) (WalProposer *wp); } walproposer_api; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 6199def43f..734e627b4d 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -101,6 +101,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); +static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); + static void init_walprop_config(bool syncSafekeepers) { @@ -1211,16 +1213,38 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* - * Receive WAL from most advanced safekeeper - */ +/* Download WAL before basebackup for logical walsenders from sk, if needed */ static bool -WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; char conninfo[MAXCONNINFO]; + TimeLineID timeline; + XLogRecPtr startpos; + XLogRecPtr endpos; + uint64 download_range_mb; + + startpos = GetLogRepRestartLSN(wp); + if (startpos == InvalidXLogRecPtr) + return true; /* recovery not needed */ + endpos = wp->propEpochStartLsn; + + /* + * If we need to download more than a max_slot_wal_keep_size, cap to it to + * avoid risk of exploding pg_wal. Logical replication won't work until + * recreated, but at least compute would start; this also follows + * max_slot_wal_keep_size semantics. + */ + download_range_mb = (endpos - startpos) / 1024 / 1024; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; + walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); + } + timeline = wp->greetRequest.timeline; if (!neon_auth_token) { @@ -1250,7 +1274,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL return false; } elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " "%d", sk->host, sk->port, (uint32) (startpos >> 32), (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); @@ -1928,15 +1952,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static void -walprop_pg_after_election(WalProposer *wp) +static XLogRecPtr +GetLogRepRestartLSN(WalProposer *wp) { FILE *f; - XLogRecPtr lrRestartLsn; + XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; /* We don't need to do anything in syncSafekeepers mode. */ if (wp->config->syncSafekeepers) - return; + return InvalidXLogRecPtr; /* * If there are active logical replication subscription we need to provide @@ -1944,25 +1968,40 @@ walprop_pg_after_election(WalProposer *wp) * replication slots. 
*/ f = fopen("restart.lsn", "rb"); - if (f != NULL && !wp->config->syncSafekeepers) + if (f != NULL) { - size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + fclose(f); if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { - elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + uint64 download_range_mb; - if (max_slot_wal_keep_size_mb <= 0 || lrRestartLsn + max_slot_wal_keep_size_mb*MB > wp->truncateLsn) + elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + + /* + * If we need to download more than a max_slot_wal_keep_size, + * don't do it to avoid risk of exploding pg_wal. Logical + * replication won't work until recreated, but at least compute + * would start; this also follows max_slot_wal_keep_size + * semantics. + */ + download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) { - /* - * start from the beginning of the segment to fetch page headers - * verifed by XLogReader - */ - lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); + walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + return InvalidXLogRecPtr; } + + /* + * start from the beginning of the segment to fetch page headers + * verifed by XLogReader + */ + lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); } } + return lrRestartLsn; } static const walproposer_api walprop_pg = { @@ -1997,5 +2036,4 @@ static const walproposer_api walprop_pg = { .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, - .after_election = walprop_pg_after_election, }; From 854df0f566e717bb2fc640201a8c11cbd0d2d125 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 14 Dec 2023 17:40:42 +0300 Subject: [PATCH 48/57] Do PQgetCopyData before PQconsumeInput in libpqwp_async_read. To avoid a lot of redundant memmoves and bloated input buffer. fixes https://github.com/neondatabase/neon/issues/6055 --- pgxn/neon/walproposer_pg.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 734e627b4d..0999156431 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -820,8 +820,7 @@ walprop_flush(Safekeeper *sk) PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - - int result; + int rawlen; if (conn->recvbuf != NULL) { @@ -829,12 +828,19 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) conn->recvbuf = NULL; } - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(conn->pg_conn)) + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); + if (rawlen == 0) { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; + /* Try consuming some data. 
*/ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* @@ -848,7 +854,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + switch (rawlen) { case 0: *amount = 0; @@ -883,7 +889,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ - *amount = result; + *amount = rawlen; *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } From 1f1c50e8c7f737213bdc7c670c7ef204c52a6f9c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Dec 2023 11:25:44 +0300 Subject: [PATCH 49/57] Don't re-add neon_walreader socket to waiteventset if possible. Should make recovery slightly more efficient (likely negligibly). --- pgxn/neon/neon_walreader.c | 11 +++++++++++ pgxn/neon/neon_walreader.h | 1 + pgxn/neon/walproposer.h | 12 +++++++++++- pgxn/neon/walproposer_pg.c | 32 ++++++++++++++++---------------- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f035c2928f..f7ec9e5bfa 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -556,6 +556,17 @@ NeonWALReaderSocket(NeonWALReader *state) return PQsocket(state->wp_conn->pg_conn); } +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. + */ +bool +NeonWALReaderIsRemConnEstablished(NeonWALReader *state) +{ + return state->rem_state == RS_ESTABLISHED; +} + /* * Returns events user should wait on connection socket or 0 if remote * connection is not active. diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 805c94fc53..6be9f149aa 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -24,6 +24,7 @@ extern void NeonWALReaderFree(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 2b2c252a18..4c2b53a1ef 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -377,9 +377,19 @@ typedef struct Safekeeper int eventPos; /* - * Neon WAL reader position in wait event set, or -1 if no socket. + * Neon WAL reader position in wait event set, or -1 if no socket. Note + * that event must be removed not only on error/failure, but also on + * successful *local* read, as next read might again be remote, but with + * different socket. */ int nwrEventPos; + + /* + * Per libpq docs, during connection establishment socket might change, + * remember here if it is stable to avoid readding to the event set if + * possible. Must be reset whenever nwr event is deleted. 
+ */ + bool nwrConnEstablished; #endif diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 0999156431..57be2d8d96 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1541,6 +1541,7 @@ walprop_pg_free_event_set(WalProposer *wp) { wp->safekeeper[i].eventPos = -1; wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1561,6 +1562,7 @@ walprop_pg_init_event_set(WalProposer *wp) { wp->safekeeper[i].eventPos = -1; wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1578,6 +1580,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events) { Assert(sk->nwrEventPos == -1); sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } @@ -1619,14 +1622,19 @@ walprop_pg_active_state_update_event_set(Safekeeper *sk) if (sk->active_state == SS_ACTIVE_READ_WAL) { /* - * TODO: instead of reattaching socket (and thus recreating WES) each - * time we should keep it if possible, i.e. if connection is already - * established. Note that single neon_walreader object can switch - * between local and remote reads multiple times during its lifetime, - * so careful bookkeeping is needed here. + * If conn is established and socket is thus stable, update the event + * directly; otherwise re-add it. */ - rm_safekeeper_event_set(sk, false); - add_nwr_event_set(sk, nwr_events); + if (sk->nwrConnEstablished) + { + Assert(sk->nwrEventPos != -1); + update_nwr_event_set(sk, nwr_events); + } + else + { + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } } else { @@ -1701,14 +1709,6 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk == to_remove) - { - if (is_sk) - sk->eventPos = -1; - else - sk->nwrEventPos = -1; - } - /* * If this safekeeper isn't offline, add events for it, except for the * event requested to remove. @@ -1725,7 +1725,7 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) /* will set sk->eventPos */ wp->api.add_safekeeper_event_set(sk, sk_events); } - else if ((sk != to_remove || is_sk) && nwr_events) + if ((sk != to_remove || is_sk) && nwr_events) { add_nwr_event_set(sk, nwr_events); } From d5fbfe2399cc85f461fc6c3b3a32077d0b9ebd73 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Dec 2023 16:02:42 +0300 Subject: [PATCH 50/57] Remove test_wal_deleted_after_broadcast. It is superseded by stronger test_lagging_sk. --- test_runner/regress/test_wal_acceptor.py | 54 ------------------------ 1 file changed, 54 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 5a0856c69c..cf8df389c8 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1557,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted -# to all safekeepers. This test checks that compute WAL can fit into small number -# of WAL segments. 
-def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): - # used to calculate delta in collect_stats - last_lsn = Lsn(0) - - # returns pg_wal size in MB - def collect_stats(endpoint: Endpoint, cur, enable_logs=True): - nonlocal last_lsn - assert endpoint.pgdata_dir is not None - - log.info("executing INSERT to generate WAL") - current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 - if enable_logs: - lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 - log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") - last_lsn = current_lsn - return pg_wal_size_mb - - # generates about ~20MB of WAL, to create at least one new segment - def generate_wal(cur): - cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch("test_wal_deleted_after_broadcast") - # Adjust checkpoint config to prevent keeping old WAL segments - endpoint = env.endpoints.create_start( - "test_wal_deleted_after_broadcast", - config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - cur.execute("CREATE TABLE t(key int, value text)") - - collect_stats(endpoint, cur) - - # generate WAL to simulate normal workload - for _ in range(5): - generate_wal(cur) - collect_stats(endpoint, cur) - - log.info("executing checkpoint") - cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(endpoint, cur) - - # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) - assert wal_size_after_checkpoint < 16 * 2.5 - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled From bfc98f36e34467c271afe851bc23e90b95d0ead6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 16 Dec 2023 00:00:49 +0300 Subject: [PATCH 51/57] Refactor handling responses in walproposer. Remove confirm_wal_streamed; we already apply both write and flush positions of the slot to commit_lsn which is fine because 1) we need to wake up waiters 2) committed WAL can be fetched from safekeepers by neon_walreader now. 
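The resulting truncateLsn rule can be summarized as: advance to the slowest safekeeper's flush position, but never past the quorum commit position and never backwards. A hedged sketch with hypothetical names (not the C implementation):

```rust
/// Illustrative only: next truncateLsn after handling safekeeper responses.
/// It may advance to the minimum flush LSN across safekeepers, capped by the
/// quorum commit LSN so that truncateLsn <= commitLsn always holds.
fn next_truncate_lsn(flush_lsns: &[u64], commit_lsn: u64, truncate_lsn: u64) -> u64 {
    let min_flush = flush_lsns.iter().copied().min().unwrap_or(truncate_lsn);
    truncate_lsn.max(min_flush.min(commit_lsn))
}
```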
--- libs/walproposer/src/api_bindings.rs | 9 --- libs/walproposer/src/walproposer.rs | 4 -- pgxn/neon/walproposer.c | 33 ++++------- pgxn/neon/walproposer.h | 6 -- pgxn/neon/walproposer_pg.c | 85 ++++++++++++++-------------- 5 files changed, 56 insertions(+), 81 deletions(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index e884f8438a..1f7bf952dc 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -326,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog } } -extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).confirm_wal_streamed(&mut (*wp), lsn) - } -} - extern "C" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, @@ -419,7 +411,6 @@ pub(crate) fn create_api() -> walproposer_api { get_redo_start_lsn: Some(get_redo_start_lsn), finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), - confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 87001c9c66..35c8f6904d 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,10 +142,6 @@ pub trait ApiImpl { todo!() } - fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) { - todo!() - } - fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) { todo!() } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 5874d199f9..7fb0cab9a0 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1643,35 +1643,26 @@ static void HandleSafekeeperResponse(WalProposer *wp) { XLogRecPtr minQuorumLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr candidateTruncateLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. + * Try to advance truncateLsn -- the last record flushed to all + * safekeepers. * - * Advanced truncateLsn should be not further than nearest commitLsn. This - * prevents surprising violation of truncateLsn <= commitLsn invariant - * which might occur because 1) truncateLsn can be advanced immediately - * once chunk is broadcast to all safekeepers, and commitLsn generally - * can't be advanced based on feedback from safekeeper who is still in the - * previous epoch (similar to 'leader can't commit entries from previous - * term' in Raft); 2) chunks we read from WAL and send are plain sheets of - * bytes, but safekeepers ack only on record boundaries. + * Advanced truncateLsn should be not higher than commitLsn. 
This prevents + * surprising violation of truncateLsn <= commitLsn invariant which might + * occur because commitLsn generally can't be advanced based on feedback + * from safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) */ - minFlushLsn = CalculateMinFlushLsn(wp); - if (minFlushLsn > wp->truncateLsn) + candidateTruncateLsn = CalculateMinFlushLsn(wp); + candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + if (candidateTruncateLsn > wp->truncateLsn) { - wp->truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note that - * slot doesn't exist if we are in syncSafekeepers mode. - */ - wp->api.confirm_wal_streamed(wp, wp->truncateLsn); + wp->truncateLsn = candidateTruncateLsn; } /* diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 4c2b53a1ef..6d478076fe 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -557,12 +557,6 @@ typedef struct walproposer_api */ void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); - /* - * Called on peer_horizon_lsn updates. Used to advance replication slot - * and to free up disk space by deleting unnecessary WAL. - */ - void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn); - /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 57be2d8d96..10c740840f 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1812,7 +1812,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Get PageserverFeedback fields from the most advanced safekeeper + * Choose most advanced PageserverFeedback and set it to *rf. */ static void GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) @@ -1842,8 +1842,6 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) LSN_FORMAT_ARGS(rf->disk_consistent_lsn), LSN_FORMAT_ARGS(rf->remote_consistent_lsn), rf->replytime); - - replication_feedback_set(rf); } /* @@ -1883,63 +1881,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) hs->catalog_xmin = InvalidFullTransactionId; } +/* + * Based on commitLsn and safekeeper responses including pageserver feedback, + * 1) Propagate cluster size received from ps to ensure the limit. + * 2) Propagate pageserver LSN positions to ensure backpressure limits. + * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters). + * 4) Propagate hot standby feedback. + * + * None of that is functional in sync-safekeepers. 
+ */ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) { HotStandbyFeedback hsFeedback; - XLogRecPtr diskConsistentLsn; + XLogRecPtr oldDiskConsistentLsn; - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + if (wp->config->syncSafekeepers) + return; - if (!wp->config->syncSafekeepers) + oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + replication_feedback_set(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + + if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - if (commitLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = commitLsn; - /* advance the replication slot */ - if (!wp->config->syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* + * Advance the replication slot to commitLsn. WAL before it is + * hardened and will be fetched from one of safekeepers by + * neon_walreader if needed. + * + * Also wakes up syncrep waiters. + */ + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. - */ - quorumFeedback.rf.disk_consistent_lsn, - walprop_pg_get_current_timestamp(wp), false); + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. 
+ */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { quorumFeedback.hs = hsFeedback; - if (!wp->config->syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } } -static void -walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn) -{ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(lsn); -} - static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { @@ -2040,6 +2044,5 @@ static const walproposer_api walprop_pg = { .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, - .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, }; From ddc431fc8f5cd48073fad5f1f1246cdc198e6954 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 26 Dec 2023 12:03:42 +0300 Subject: [PATCH 52/57] pgindent walproposer condvar comment --- pgxn/neon/walproposer_pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 10c740840f..7773aabfab 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1748,8 +1748,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); /* - * Now that we prepared the condvar, check flush ptr again -- it might have - * changed before we subscribed to cv so we missed the wakeup. + * Now that we prepared the condvar, check flush ptr again -- it might + * have changed before we subscribed to cv so we missed the wakeup. * * Do that only when we're interested in new WAL: without sync-safekeepers * and if election already passed. From 6e40900569df5c09763034198990560bc1eee6aa Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 14 Dec 2023 15:08:14 +0000 Subject: [PATCH 53/57] Manage pgbouncer configuration from compute_ctl: - add pgbouncer_settings section to compute spec; - add pgbouncer-connstr option to compute_ctl. - add pgbouncer-ini-path option to compute_ctl. Default: /etc/pgbouncer/pgbouncer.ini Apply pgbouncer config on compute start and respec to override default spec. Save pgbouncer config updates to pgbouncer.ini to preserve them across pgbouncer restarts. 
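Roughly, the flow is: read pgbouncer_settings from the compute spec, apply each option through pgbouncer's admin console, then persist it to pgbouncer.ini so the values survive a restart. A condensed, hedged sketch of that flow is below; the function and parameter names are assumed for illustration, and the real implementation lives in the compute_tools changes in this patch:

```rust
// Condensed sketch (assumed names) of the flow described above: apply each
// setting via pgbouncer's admin console, then mirror it into pgbouncer.ini so
// the values survive a pgbouncer restart.
use std::collections::HashMap;

async fn apply_pgbouncer_settings(
    settings: &HashMap<String, String>,
    admin_connstr: &str, // value of --pgbouncer-connstr
    ini_path: &str,      // value of --pgbouncer-ini-path
) -> anyhow::Result<()> {
    let (client, conn) = tokio_postgres::connect(admin_connstr, tokio_postgres::NoTls).await?;
    tokio::spawn(conn); // drive the connection in the background
    for (name, value) in settings {
        // pgbouncer's admin console accepts plain `SET name = value` statements.
        client.simple_query(&format!("SET {name} = {value}")).await?;
    }
    // Persist the applied values into the [pgbouncer] section of the ini file.
    let mut conf = ini::Ini::load_from_file(ini_path)?;
    if let Some(section) = conf.section_mut(Some("pgbouncer")) {
        for (name, value) in settings {
            section.insert(name.as_str(), value.as_str());
        }
    }
    conf.write_to_file(ini_path)?;
    Ok(())
}
```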
--- Cargo.lock | 67 +++++++++++++++++++++++ compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 26 ++++++++- compute_tools/src/compute.rs | 56 +++++++++++++++++++ compute_tools/src/pg_helpers.rs | 69 +++++++++++++++++++++++- control_plane/src/endpoint.rs | 1 + deny.toml | 1 + libs/compute_api/src/spec.rs | 2 + libs/compute_api/tests/cluster_spec.json | 4 ++ vm-image-spec.yaml | 1 + workspace_hack/Cargo.toml | 4 ++ 11 files changed, 230 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0be6d5d183..abd87dc0da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1168,6 +1168,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rust-ini", "serde", "serde_json", "tar", @@ -1201,6 +1202,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.11", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.9" @@ -1433,6 +1454,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1575,6 +1602,15 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ -3043,6 +3079,16 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "ordered-multimap" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +dependencies = [ + "dlv-list", + "hashbrown 0.14.0", +] + [[package]] name = "os_info" version = "3.7.0" @@ -4216,6 +4262,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -5170,6 +5226,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -6337,6 +6402,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", + "getrandom 0.2.11", "hex", "hmac", "hyper", @@ -6348,6 +6414,7 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "once_cell", "prost", "rand 0.8.5", "regex", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 
18b30810b0..142fa08495 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -39,3 +39,4 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" +rust-ini = "0.20.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index ce7345d5be..436db59088 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,7 +31,9 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r http://pg-ext-s3-gateway +//! -r http://pg-ext-s3-gateway \ +//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' +//! --pgbouncer-ini-path /etc/pgbouncer.ini \ //! ``` //! use std::collections::HashMap; @@ -99,6 +101,9 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); + let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -209,6 +214,8 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, + pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), + pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); @@ -493,6 +500,23 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("pgbouncer-connstr") + .long("pgbouncer-connstr") + .default_value( + "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", + ) + .value_name("PGBOUNCER_CONNSTR"), + ) + .arg( + Arg::new("pgbouncer-ini-path") + .long("pgbouncer-ini-path") + // Note: this doesn't match current path for pgbouncer.ini. + // Until we fix it, we need to pass the path explicitly + // or this will be effectively no-op. + .default_value("/etc/pgbouncer.ini") + .value_name("PGBOUNCER_INI_PATH"), + ) } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b39a800f14..cd7be0520e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -7,6 +7,7 @@ use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::{Condvar, Mutex, RwLock}; +use std::thread; use std::time::Instant; use anyhow::{Context, Result}; @@ -64,6 +65,10 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? 
pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, + // connection string to pgbouncer to change settings + pub pgbouncer_connstr: Option, + // path to pgbouncer.ini to change settings + pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -737,6 +742,31 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); @@ -791,6 +821,32 @@ impl ComputeNode { pspec.timeline_id, ); + // tune pgbouncer + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b79e516650..0b0e137c03 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,9 +9,11 @@ use std::process::Child; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use tracing::{debug, instrument}; +use tokio_postgres::NoTls; +use tracing::{debug, error, info, instrument}; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; @@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { Ok(()) } + +/// Update pgbouncer.ini with provided options +pub fn update_pgbouncer_ini( + pgbouncer_config: HashMap, + pgbouncer_ini_path: &str, +) -> Result<()> { + let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; + let section = conf.section_mut(Some("pgbouncer")).unwrap(); + + for (option_name, value) in pgbouncer_config.iter() { + section.insert(option_name, value); + } + + conf.write_to_file(pgbouncer_ini_path)?; + Ok(()) +} + +/// Tune pgbouncer. +/// 1. Apply new config using pgbouncer admin console +/// 2. 
Add new values to pgbouncer.ini to preserve them after restart +pub async fn tune_pgbouncer( + pgbouncer_settings: Option>, + pgbouncer_connstr: &str, + pgbouncer_ini_path: Option, +) -> Result<()> { + if let Some(pgbouncer_config) = pgbouncer_settings { + // Apply new config + let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + for (option_name, value) in pgbouncer_config.iter() { + info!( + "Applying pgbouncer setting change: {} = {}", + option_name, value + ); + let query = format!("SET {} = {}", option_name, value); + + let result = client.simple_query(&query).await; + + info!("Applying pgbouncer setting change: {}", query); + info!("pgbouncer setting change result: {:?}", result); + + if let Err(err) = result { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err + ); + }; + } + + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + if let Some(pgbouncer_ini_path) = pgbouncer_ini_path { + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + } + } + + Ok(()) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 071f22dc2b..55b66742ca 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -537,6 +537,7 @@ impl Endpoint { safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, + pgbouncer_settings: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/deny.toml b/deny.toml index 079dcac679..22e39a2ca3 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ allow = [ "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", + "CC0-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2a483188e4..4ff6831272 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -73,6 +73,8 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, + + pub pgbouncer_settings: Option>, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
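
For reference, the `cluster_spec.json` fixture updated just below shows the JSON side of the new field. As a rough, stand-alone sketch (not part of the patch, and assuming the field is an `Option<HashMap<String, String>>` with the usual serde derive, which is what the string-valued fixture and `tune_pgbouncer()` suggest), this is how such a spec fragment deserializes and how the resulting settings map onto pgbouncer admin-console commands:

```rust
// Illustrative only -- not part of the patch. A stand-alone struct mirroring the
// new ComputeSpec field, showing how `pgbouncer_settings` deserializes into the
// Option<HashMap<String, String>> that tune_pgbouncer() consumes.
use std::collections::HashMap;

use serde::Deserialize;

#[derive(Deserialize)]
struct SpecFragment {
    // Missing or null in the JSON spec -> None, i.e. pgbouncer tuning is skipped.
    pgbouncer_settings: Option<HashMap<String, String>>,
}

fn main() -> Result<(), serde_json::Error> {
    let json = r#"{ "pgbouncer_settings": { "default_pool_size": "42", "pool_mode": "session" } }"#;
    let frag: SpecFragment = serde_json::from_str(json)?;

    if let Some(settings) = frag.pgbouncer_settings {
        for (name, value) in &settings {
            // tune_pgbouncer() issues the equivalent of `SET <name> = <value>`
            // against the pgbouncer admin console and mirrors it into pgbouncer.ini.
            println!("SET {} = {}", name, value);
        }
    }
    Ok(())
}
```

Keeping the values as plain strings sidesteps per-setting typing in the control plane: `tune_pgbouncer()` simply interpolates them into `SET` statements and hands the same map to `update_pgbouncer_ini()`.
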
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index e2afa17ef0..ccd015ad19 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -243,5 +243,9 @@ "public_extensions": [ "postgis" ] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" } } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 804405293f..68be0b3617 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -36,6 +36,7 @@ files: max_client_conn=10000 default_pool_size=64 max_prepared_statements=0 + admin_users=cloud_admin - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 82bbedc4ae..4f13064088 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,6 +39,7 @@ futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +getrandom = { version = "0.2", default-features = false, features = ["std"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -50,6 +51,7 @@ nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -84,11 +86,13 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } +getrandom = { version = "0.2", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +once_cell = { version = "1" } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From 136aab54793816ac86a386084ed858f522d334c5 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 26 Dec 2023 14:37:09 -0800 Subject: [PATCH 54/57] Bump postgres submodule versions --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0bb356aa0c..03358bb0b5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d +Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 24333abb81..a2dc225ddf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7 +Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 863b71572b..225071f482 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb +Subproject 
commit 225071f482774943854c2eec4540757e01171557 diff --git a/vendor/revisions.json b/vendor/revisions.json index a9575a2cb7..def4eab069 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb", - "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7", - "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d" + "postgres-v16": "225071f482774943854c2eec4540757e01171557", + "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", + "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" } From e5a3b6dfd8e7b2c7f72902e33868eddf72713630 Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Wed, 27 Dec 2023 18:15:17 +0100 Subject: [PATCH 55/57] Pg stat statements reset for neon superuser (#6232) ## Problem Extension pg_stat_statements has function pg_stat_statements_reset(). In vanilla Postgres this function can only be called by superuser role or other users/roles explicitly granted. In Neon no end user can use superuser role. Instead we have neon_superuser role. We need to grant execute on pg_stat_statements_reset() to neon_superuser ## Summary of changes Modify the Postgres v14, v15, v16 contrib in our compute docker file to grant execute on pg_stat_statements_reset() to neon_superuser. (Modifying it in our docker file is preferable to changes in neondatabase/postgres because we want to limit the changes in our fork that we have to carry with each new version of Postgres). Note that the interface of proc/function pg_stat_statements_reset changed in pg_stat_statements version 1.7 So for versions up to and including 1.6 we must `GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;` and for versions starting from 1.7 we must `GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;` If we just use `GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;` for all version this results in the following error for versions 1.7+: ```sql neondb=> create extension pg_stat_statements; ERROR: function pg_stat_statements_reset() does not exist ``` ## Checklist before requesting a review - [x ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist ## I have run the following test and could now invoke pg_stat_statements_reset() using default user ```bash (neon) peterbendel@Peters-MBP neon % kubectl get pods | grep compute-quiet-mud-88416983 compute-quiet-mud-88416983-74f4bf67db-crl4c 3/3 Running 0 7m26s (neon) peterbendel@Peters-MBP neon % kubectl set image deploy/compute-quiet-mud-88416983 compute-node=neondatabase/compute-node-v15:7307610371 deployment.apps/compute-quiet-mud-88416983 image updated (neon) peterbendel@Peters-MBP neon % psql postgresql://peterbendel:@ep-bitter-sunset-73589702.us-east-2.aws.neon.build/neondb psql (16.1, server 15.5) SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off) Type "help" for help. 
neondb=> select version(); version --------------------------------------------------------------------------------------------------- PostgreSQL 15.5 on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit (1 row) neondb=> create extension pg_stat_statements; CREATE EXTENSION neondb=> select pg_stat_statements_reset(); pg_stat_statements_reset -------------------------- (1 row) ``` --- Dockerfile.compute-node | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 8db60ff85f..14ba1b5b9a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -48,7 +48,29 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ + # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. + # In vanilla postgres this function is limited to Postgres role superuser. + # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, + # so we do it here. + old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ + # the first loop is for pg_stat_statement extension version <= 1.6 + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ + fi; \ + done; \ + # the second loop is for pg_stat_statement extension versions >= 1.7, + # where pg_stat_statement_reset() got 3 additional arguments + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if ! echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ + fi; \ + done ######################################################################################### # From 1c037209c775f0330c2ffc7c5c1826487c75b0e1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 29 Dec 2023 09:32:24 +0000 Subject: [PATCH 56/57] proxy: fix compute addr parsing (#6237) ## Problem control plane should be able to return domain names and not just IP addresses. ## Summary of changes 1. add regression tests 2. 
use rsplit to split the port from the back, then trim the ipv6 brackets --- proxy/src/console/provider/neon.rs | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 628d98df49..5bf7b0f986 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -11,7 +11,7 @@ use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use std::{net::SocketAddr, sync::Arc}; +use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -141,7 +141,7 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, @@ -269,9 +269,10 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(String, u16)> { - let parsed: SocketAddr = input.parse().ok()?; - Some((parsed.ip().to_string(), parsed.port())) +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) } #[cfg(test)] @@ -279,9 +280,24 @@ mod tests { use super::*; #[test] - fn test_parse_host_port() { + fn test_parse_host_port_v4() { let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); assert_eq!(host, "127.0.0.1"); assert_eq!(port, 5432); } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } } From f28bdb652854200e97febe9eb601a1dc2534266a Mon Sep 17 00:00:00 2001 From: Abhijeet Patil Date: Sat, 30 Dec 2023 13:45:31 +0000 Subject: [PATCH 57/57] Use nextest for rust unittests (#6223) ## Problem `cargo test` doesn't support timeouts or junit output format ## Summary of changes - Add `nextest` to `build-tools` image - Switch `cargo test` with `cargo nextest` on CI - Set timeout --- .config/nextest.toml | 2 ++ .github/workflows/build_and_test.yml | 8 ++++---- Dockerfile.buildtools | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 .config/nextest.toml diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 0000000000..8bccd51c6d --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,2 @@ +[profile.default] +slow-timeout = "1m" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3091ce6d3a..78deff6e85 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -339,16 +339,16 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run cargo test + - name: Run rust tests run: | - 
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -358,7 +358,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install rust binaries run: | diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools index 77722f173b..c2fcd8841e 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.buildtools @@ -151,6 +151,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-hakari && \ cargo install cargo-deny && \ cargo install cargo-hack && \ + cargo install cargo-nextest && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git ENV RUSTC_WRAPPER=cachepot
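
For anyone reproducing the new CI behaviour locally, the following is a rough sketch (assuming `cargo-nextest` is installed, e.g. via `cargo install cargo-nextest`, and omitting the CI-specific `$CARGO_FLAGS`/`$CARGO_FEATURES` and the S3/Azure environment variables):

```bash
# Sketch: running the same tests locally with nextest.
# The slow-test threshold comes from .config/nextest.toml (slow-timeout = "1m" added by this patch).

# Whole workspace, like the CI "Run rust tests" step:
cargo nextest run

# Only the remote_storage package, using a filter expression like the CI S3/Azure steps:
cargo nextest run -E 'package(remote_storage)'

# List matching tests without running them, handy for checking a filter expression:
cargo nextest list -E 'package(remote_storage)'
```

Note that `slow-timeout` on its own only marks tests as slow; actually terminating them additionally requires a `terminate-after` setting in the same profile, and JUnit output can be enabled later via a `[profile.<name>.junit]` section in the same config file.
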